Figure 1: Decomposition of model-based mean expression of mixture samples on a certain gene. Each donut represents a sample, and the total volume is proportional to gene-level mean expression in terms of read counts. The central angle of each slice of a donut is proportional to the cellular fractions of each cell type. The donut area is proportional to sample read depth adjusted by cell type-independent effects. The depth of each slice is proportional to the cell-type specific expression. Three samples from the case group and three samples of the control group are illustrated. The underlying cell type-specific expression says that cell type 2 and cell type 3 are not differentially expressed in the gene, while the gene expression of cell type 1 in the case group doubles when compared to the control group.
See the Pen Control3 by chongjin (@chongjin) on CodePen.
pdf("figures/barplot_3d_unedited.pdf", width = 12, height = 8)
par(mfrow=c(1,6), bty="n", mar=c(5,4,2,1))
# case
CARseq:::barplot_3d(x = c(0.45, 0.35, 0.2), y = c(3, 2, 1), z = 1,
xlim = c(0, 1.25), ylim = c(0, 3.2))
CARseq:::barplot_3d(x = c(0.6, 0.1, 0.3), y = c(3, 2, 1), z = 2,
xlim = c(0, 1.25), ylim = c(0, 3.2))
CARseq:::barplot_3d(x = c(0.7, 0.15, 0.15), y = c(3, 2, 1), z = 3,
xlim = c(0, 1.25), ylim = c(0, 3.2))
# control
CARseq:::barplot_3d(x = c(0.45, 0.35, 0.2), y = c(1.5, 2, 1), z = 1,
xlim = c(0, 1.25), ylim = c(0, 3.2))
CARseq:::barplot_3d(x = c(0.6, 0.1, 0.3), y = c(1.5, 2, 1), z = 2,
xlim = c(0, 1.25), ylim = c(0, 3.2))
CARseq:::barplot_3d(x = c(0.7, 0.15, 0.15), y = c(1.5, 2, 1), z = 3,
xlim = c(0, 1.25), ylim = c(0, 3.2))
dev.off()## png
## 2
set.seed(1234)
autism_working_dir = "."
scz_working_dir = "."
cache_dir = "reproducible_figures_cache"
if (!file.exists(cache_dir)) dir.create(cache_dir)
############################################################
# 4 seqSVs
############################################################
# CARseq, ICeDT
# not permuted
res_CARseq = readRDS(file.path(autism_working_dir, "results/ASD_CARseq_ICeDT_seqSV4.rds"))
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_CARseq$p))) {
hist(res_CARseq$p[,k], main=colnames(res_CARseq$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_CARseq$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}# permuted
res_CARseq_permuted = readRDS(file.path(autism_working_dir, "results/ASD_CARseq_ICeDT_permuted_seqSV4.rds"))
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_CARseq_permuted$p))) {
hist(res_CARseq_permuted$p[,k], main=colnames(res_CARseq_permuted$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_CARseq_permuted$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}The reactome pathways are downloaded from MSigDB website. They are in gene symbols.
autism_working_dir = "."
# load pathways:
gmtfile_go_bp = file.path(autism_working_dir, "data/c5.bp.v7.1.symbols.gmt")
gmtfile_reactome = file.path(autism_working_dir, "data/c2.cp.reactome.v7.1.symbols.gmt")
pathways_go_bp = gmtPathways(gmtfile_go_bp)
pathways_reactome = gmtPathways(gmtfile_reactome)The number and size of pathways are listed here:
## List of 1532
## List of 7530
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 5 7 9 12 17 23 31 44 72 120 1470
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 5 6 8 11 14 20 30 48 89 198 1994
We use “classic” weight where no weight is added. The genes are ranked by p-values. The gene matching is done in gene symbol. There is a lot of genes with p-values being 1, which “may or may not cause trouble”, so we only care about the genes that are enriched in small p-values:
############################################################
# 4 seqSVs
############################################################
# CARseq, ICeDT
# not permuted
res_TOAST = list()
res_TOAST$p = read.table(file.path(autism_working_dir, "results/ASD_step1_TOAST_with_covariates_seqSV4.txt"))
# png("figures/CARseq_ICeDT_SV2.png",
# width=6, height=6, units="in", res=400)
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_TOAST$p))) {
hist(res_TOAST$p[,k], main=colnames(res_TOAST$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_TOAST$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}# dev.off()
# permuted
res_TOAST_permuted = list()
res_TOAST_permuted$p = read.table(file.path(autism_working_dir, "results/ASD_step1_TOAST_with_covariates_seqSV4_permuted.txt"))
# png("figures/CARseq_ICeDT_permuted_SV2.png",
# width=6, height=6, units="in", res=400)
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_TOAST_permuted$p))) {
hist(res_TOAST_permuted$p[,k], main=colnames(res_TOAST_permuted$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_TOAST_permuted$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}# Running fgsea:
pval = -log10(res_TOAST$p[, "Astro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.0182 0.18455 0.08314 0.36429 0.00921 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_TOAST_GSEAMultilevel_REACTOME_Astro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.67 1 0.067 -0.039 -0.85 365
## leadingEdge
## 1: HCN3,PTPRD,FLOT2,CACNB4,KCNH8,KCNN2,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Exc"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.05494 0.10864 0.00733 0.62502 0.14645 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_TOAST_GSEAMultilevel_REACTOME_Exc.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.009 0.27 0.38 0.088 2 365
## leadingEdge
## 1: NRGN,SHANK2,GRIN2C,GNG3,DLG4,IL1RAP,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Inh"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.125 0.0602 0.2203 0.4367 0.016 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_TOAST_GSEAMultilevel_REACTOME_Inh.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.92 0.99 0.047 0.029 0.63 365
## leadingEdge
## 1: SYT10,LRFN1,GRIN2D,SHANK2,PRKAG2,SLITRK5,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Oligo"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.4141 0.1127 0.0554 0.2569 0.5649 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_TOAST_GSEAMultilevel_REACTOME_Oligo.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.52 0.91 0.08 -0.044 -0.95 365
## leadingEdge
## 1: SIPA1L1,APBA1,KCNMB4,ARL6IP5,BEGAIN,HSPA8,...
# Running fgsea:
# shrunken_lfc = abs(res_TOAST$shrunken_lfc[, "Control_vs_ASD:Micro"]) # abs of shrunken LFC
pval = -log10(res_TOAST$p[, "Micro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.8815 0.0281 0.2431 0.3933 0.5202 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_TOAST_GSEAMultilevel_REACTOME_Micro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.38 0.78 0.092 0.047 1 365
## leadingEdge
## 1: SNAP25,CPLX1,GNAI3,HCN3,CHRNA2,PRKAB2,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "OPC"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.0051 0.1069 0.166 0.055 0.1749 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_TOAST_GSEAMultilevel_REACTOME_OPC.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.13 0.7 0.18 -0.061 -1.3 365
## leadingEdge
## 1: NCALD,PRKCB,APBA1,KCNG1,KCNS1,SYT12,...
We use “classic” weight where no weight is added. The genes are ranked by p-values. The gene matching is done in gene symbol. There is a lot of genes with p-values being 1, which “may or may not cause trouble”, so we only care about the genes that are enriched in small p-values:
# Running fgsea:
pval = -log10(res_CARseq$p[, "Astro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.901 0.178 0.128 0 0.744 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_REACTOME_Astro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 1.7e-12 1.9e-09 0.91 -0.19 -4.3 363
## leadingEdge
## 1: GRIN2B,TUBB3,VAMP2,GRIN3A,LRRTM3,CALM1,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Exc"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.219 0.759 0.59 0 0.139 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_REACTOME_Exc.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 2.5e-07 0.00028 0.67 0.15 3.2 363
## leadingEdge
## 1: KCNC4,GRIA3,KCNQ2,LRRTM3,SRC,ADCY5,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Inh"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 1.533 1.104 0.274 0.165 0 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_REACTOME_Inh.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 2.4e-06 0.00016 0.63 0.13 2.9 363
## leadingEdge
## 1: KCNN1,GRIA3,CACNB3,NLGN1,LRRTM3,PANX2,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Oligo"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.445 0.326 0 0.373 0 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_REACTOME_Oligo.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.00024 0.006 0.52 -0.11 -2.5 363
## leadingEdge
## 1: GRIN2B,NRAS,CACNA1E,DLGAP2,KCNMB2,SIPA1L1,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Micro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.155 0 0.112 0.812 0.204 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_REACTOME_Micro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 4.2e-11 2.2e-09 0.85 -0.18 -4.1 363
## leadingEdge
## 1: GRIN2B,VAMP2,GABBR1,GRIN3A,LRRTM3,UNC13B,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "OPC"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0 0 0 0 0.146 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_REACTOME_OPC.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 1.1e-05 0.00044 0.59 0.13 2.8 363
## leadingEdge
## 1: KCNJ16,BCHE,GNB2,SHANK1,SLC6A13,PRKAB1,...
# Running fgsea:
res_DESeq2 = list()
res_DESeq2$p = read.table(file.path(autism_working_dir, "results/ASD_step1_DESeq2_bulk_adj_covariates_seqSV4_lfcShrink.txt"))
res_DESeq2$p_permuted = read.table(file.path(autism_working_dir, "results/ASD_step1_DESeq2_bulk_adj_covariates_seqSV4_permuted.txt"))
pval = -log10(res_DESeq2$p[, "pvalue"])
# plot distribution of p-values
opar = par(mfrow=c(1,2), bty="n", mar=c(5,4,2,1))
hist(res_DESeq2$p[, "pvalue"], main="DESeq2 bulk", breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_DESeq2$p[, "pvalue"]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
hist(res_DESeq2$p_permuted[, "pvalue"], main="DESeq2 bulk permuted", breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_DESeq2$p_permuted[, "pvalue"]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_DESeq2$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 3.795 0.157 0.645 0.16 0.636 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_DESeq2_GSEAMultilevel_REACTOME.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.0063 0.046 0.41 0.093 2.1 365
## leadingEdge
## 1: GNG12,HTR3B,MAOA,TUBB2B,SYT12,SLC1A3,...
https://gene-archive.sfari.org/database/gene-scoring/
SFARI_table = read.csv(file.path(autism_working_dir, "data/SFARI-Gene_genes_10-31-2019release_04-08-2020export.csv"), as.is=TRUE)
dim(SFARI_table)## [1] 980 8
SFARI_table = SFARI_table[SFARI_table$gene.symbol %in% rowData(rse_filtered)$gene_name, ]
dim(SFARI_table)## [1] 883 8
##
## 1 2 3 4 5 6
## 0 10 43 168 374 156 22
## 1 14 21 16 38 0 0
genes_in_SFARI = SFARI_table$gene.symbol[SFARI_table$syndromic == 1 | SFARI_table$gene.score %in% 1:3]
genes_in_SFARI## [1] "ACHE" "ADA" "ADCY3" "ADNP" "AGAP2" "AGO1"
## [7] "AHDC1" "AKAP9" "AMT" "ANK2" "ANK3" "ANKRD11"
## [13] "ANXA1" "APBB1" "APH1A" "ARHGEF9" "ARID1B" "AGO4"
## [19] "ASAP2" "ASH1L" "ASPM" "ASTN2" "ASXL3" "ALG6"
## [25] "ATP10A" "ATP1A3" "ATP2B2" "AUTS2" "BAZ2B" "BCKDK"
## [31] "BCL11A" "ATP1A1" "BTAF1" "CACNA1D" "CACNA1E" "CACNA1H"
## [37] "CACNA2D3" "CACNB2" "CAMK2A" "CAPRIN1" "CC2D1A" "CCT4"
## [43] "CDC42BPB" "CELF4" "CEP135" "CEP290" "CEP41" "CGNL1"
## [49] "CHD1" "CHD2" "CHD8" "CASZ1" "CCNG1" "CCNK"
## [55] "CDH13" "CHD3" "CHMP1A" "CHRNA7" "CIB2" "CIC"
## [61] "CLASP1" "CNKSR2" "CNOT3" "CNR1" "CNTN4" "CNTN5"
## [67] "CNTN6" "CNTNAP2" "CNTNAP4" "CTCF" "CTNNB1" "CTNND2"
## [73] "CTTNBP2" "CUL3" "CUL7" "CUX1" "CPEB4" "CTNNA2"
## [79] "CUX2" "CYFIP1" "CYP27A1" "DAPP1" "DDX3X" "DEAF1"
## [85] "DENR" "DHX30" "DIP2A" "DIP2C" "DISC1" "DLGAP1"
## [91] "DNMT3A" "DOCK8" "DPP10" "DPYSL2" "DSCAM" "DYNC1H1"
## [97] "DYRK1A" "EFR3A" "EHMT1" "ELAVL3" "ELP4" "EP300"
## [103] "EP400" "ETFB" "FBN1" "FBXO11" "FOXP1" "FOXP2"
## [109] "GABBR2" "GABRB3" "GATM" "GGNBP2" "GIGYF1" "GIGYF2"
## [115] "GPC4" "GPHN" "GRIA1" "GRID1" "GRIK2" "GRIK5"
## [121] "GRIN1" "GRIN2A" "GABRG3" "GALNT8" "GRIN2B" "GRIP1"
## [127] "HECTD4" "HECW2" "HIVEP3" "HMGN1" "HNRNPU" "ICA1"
## [133] "HDAC8" "ILF2" "INTS6" "IQSEC2" "IRF2BPL" "ITGB3"
## [139] "JARID2" "KAT2B" "KAT6A" "KATNAL2" "KCNJ10" "KCNQ2"
## [145] "KCNQ3" "KDM5B" "KDM5C" "KDM6A" "KDM6B" "KIAA1586"
## [151] "KIF5C" "KIRREL3" "KMT2A" "KCNS3" "KDM4C" "KMT2C"
## [157] "KMT2E" "LAMB1" "LEO1" "LZTR1" "MACROD2" "MAGEL2"
## [163] "MBD5" "MBOAT7" "MECP2" "MED13" "MED13L" "MEF2C"
## [169] "MET" "MTOR" "MYO5A" "MYO9B" "MYT1L" "NAA15"
## [175] "NACC1" "NAV2" "NBEA" "NCKAP1" "NCOR1" "NINL"
## [181] "NLGN1" "NLGN3" "NLGN4X" "NR2F1" "NR3C2" "NRXN1"
## [187] "NRXN3" "MYH10" "NFE2L3" "NFIB" "NTNG1" "NTRK2"
## [193] "NUAK1" "OPHN1" "OTUD7A" "OXTR" "P2RX5" "P4HA2"
## [199] "PAH" "PARD3B" "NUDCD2" "PAK2" "PCCA" "PER2"
## [205] "PHF2" "PHF3" "PHIP" "PHRF1" "PLCB1" "PLXNA4"
## [211] "PLXNB1" "POGZ" "PPM1D" "PPP2R5D" "PREX1" "PRICKLE1"
## [217] "PRICKLE2" "PRKCB" "PRODH" "PRR12" "PSMD12" "PTCHD1"
## [223] "PTEN" "PTK7" "PTPN11" "RAB2A" "RAB43" "RAI1"
## [229] "PHF21A" "RANBP17" "RBFOX1" "RBM27" "RELN" "PHB"
## [235] "PON1" "PPP2CA" "RALA" "RALGAPB" "RERE" "RHEB"
## [241] "RIMS1" "RNF135" "ROBO2" "RPS6KA3" "SAE1" "SATB2"
## [247] "SBF1" "SCN1A" "RAD21" "SCN2A" "SCN8A" "SCN9A"
## [253] "SEMA5A" "SETBP1" "SETD1B" "SETD2" "SETD5" "SHANK1"
## [259] "SHANK2" "SHANK3" "SIN3A" "SLC12A5" "SET" "SLC35B1"
## [265] "SLC38A10" "SLC6A1" "SLC7A3" "SLC7A5" "SMAD4" "SMARCA4"
## [271] "SMARCC2" "SMC1A" "SMC3" "SPARCL1" "SPAST" "SRCAP"
## [277] "SRSF11" "STXBP1" "STXBP5" "SYNE1" "SYNGAP1" "TAF6"
## [283] "TANC2" "TAOK2" "TBC1D23" "TBC1D31" "TBL1XR1" "SLITRK5"
## [289] "SNX5" "SPEN" "ST8SIA2" "TBR1" "TCF20" "TCF4"
## [295] "TCF7L2" "TERF2" "TET2" "TLK2" "TMLHE" "TNRC6B"
## [301] "TRAPPC9" "TRIO" "TRIP12" "TRPC6" "TSC2" "TTN"
## [307] "UBE3A" "UBE3C" "UBN2" "UBR5" "UNC13A" "UNC79"
## [313] "UPF3B" "TRAF7" "TRRAP" "USP15" "USP45" "USP7"
## [319] "WAC" "WDFY3" "WWOX" "ZBTB20" "ZC3H4" "ZMYND11"
## [325] "ZNF462" "WASF1" "WDFY4" "ZNF804A"
# # add a random line or reference
# SFARI_enrichment_table = NULL
for (cell_type in colnames(res_CARseq$p)) {
# Loading stats:
pval = -log10(res_CARseq$p[, cell_type])
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
# str(stats)
length(stats)
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)
stats = na.omit(stats)
cat("############################################################\n")
cat(sprintf("CARseq %s:\n", cell_type))
cat("############################################################\n")
suppressWarnings(print(fgseaMultilevel(list(SFARI=genes_in_SFARI), stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)))
}## ############################################################
## CARseq Astro:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.051 0.051 0.29 0.075 1.6 328
## leadingEdge
## 1: OTUD7A,MYO9B,BCKDK,CYFIP1,LZTR1,PHRF1,...
## ############################################################
## CARseq Exc:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.00033 0.00033 0.5 0.11 2.4 328
## leadingEdge
## 1: KCNQ2,KIAA1586,USP15,DPP10,GPHN,GRIN1,...
## ############################################################
## CARseq Inh:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 5.8e-07 5.8e-07 0.66 0.15 3.2 328
## leadingEdge
## 1: USP15,ZC3H4,KIAA1586,NLGN1,RAD21,KCNQ2,...
## ############################################################
## CARseq Micro:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 2.9e-07 2.9e-07 0.67 -0.16 -3.2 328
## leadingEdge
## 1: GRIN2B,CUX1,MAGEL2,SHANK3,PLXNA4,MBD5,...
## ############################################################
## CARseq Oligo:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.0036 0.0036 0.43 0.098 2 328
## leadingEdge
## 1: CAPRIN1,WAC,CDC42BPB,CUL3,BCKDK,FOXP2,...
## ############################################################
## CARseq OPC:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 1.9e-05 1.9e-05 0.58 0.14 2.8 328
## leadingEdge
## 1: MYO9B,SCN9A,PARD3B,PRR12,IRF2BPL,DIP2A,...
# add a random line or reference
for (cell_type in colnames(res_TOAST$p)) {
# Loading stats:
pval = -log10(res_TOAST$p[, cell_type])
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
# str(stats)
length(stats)
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)
stats = na.omit(stats)
cat("############################################################\n")
cat(sprintf("TOAST %s:\n", cell_type))
cat("############################################################\n")
suppressWarnings(print(fgseaMultilevel(list(SFARI=genes_in_SFARI), stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)))
}## ############################################################
## TOAST Astro:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.12 0.12 0.18 -0.065 -1.4 328
## leadingEdge
## 1: UBE3C,NUAK1,GRIP1,KMT2A,ASXL3,CIC,...
## ############################################################
## TOAST Exc:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.89 0.89 0.051 0.032 0.66 328
## leadingEdge
## 1: SHANK2,KDM5C,SBF1,PER2,RPS6KA3,PRICKLE2,...
## ############################################################
## TOAST Inh:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.14 0.14 0.16 -0.064 -1.3 328
## leadingEdge
## 1: BTAF1,SHANK1,CUL7,CACNA1E,P2RX5,SCN9A,...
## ############################################################
## TOAST Micro:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.00041 0.00041 0.5 -0.11 -2.3 328
## leadingEdge
## 1: CACNA2D3,ETFB,PAH,SETD2,TNRC6B,BCKDK,...
## ############################################################
## TOAST Oligo:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.36 0.36 0.096 -0.05 -1.1 328
## leadingEdge
## 1: MYH10,NINL,CUL3,ANXA1,ANK2,RNF135,...
## ############################################################
## TOAST OPC:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.55 0.55 0.075 -0.043 -0.91 328
## leadingEdge
## 1: MAGEL2,PRKCB,TRIP12,GABBR2,NLGN4X,PSMD12,...
# add a random line or reference
SFARI_enrichment_table = NULL
# Loading stats:
pval = -log10(res_DESeq2$p[, "pvalue"])
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
# str(stats)
length(stats)## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
## ############################################################
## DESeq2 Bulk:
## ############################################################
## pathway pval padj log2err ES NES size
## 1: SFARI 0.66 0.66 0.065 -0.041 -0.86 328
## leadingEdge
## 1: SHANK3,GRIK2,AGO4,RHEB,ADCY3,SHANK2,...
pval_dir = file.path(autism_working_dir, "results/_pvalues")
if (!dir.exists(pval_dir)) dir.create(pval_dir)
for (cell_type in colnames(res_CARseq$p)) {
pvalues = data.frame(gene_id=rowData(rse_filtered)$gene_id,
gene_name=rowData(rse_filtered)$gene_name,
pvalue=res_CARseq$p[, cell_type])
write_delim(pvalues, path = file.path(pval_dir, sprintf("ASD_CARseq_%s.txt", cell_type)))
}
for (cell_type in colnames(res_TOAST$p)) {
pvalues = data.frame(gene_id=rowData(rse_filtered)$gene_id,
gene_name=rowData(rse_filtered)$gene_name,
pvalue=res_TOAST$p[, cell_type])
write_delim(pvalues, path = file.path(pval_dir, sprintf("ASD_TOAST_%s.txt", cell_type)))
}
pvalues = data.frame(gene_id=rowData(rse_filtered)$gene_id,
gene_name=rowData(rse_filtered)$gene_name,
pvalue=res_DESeq2$p[, "pvalue"])
write_delim(pvalues, path = file.path(pval_dir, "ASD_DESeq2_bulk.txt"))Figure: Top 3 CARseq cell type-specific REACTOME pathways ranked by -log10 q value with the sign of NES in the ASD study.
topPathways_table = NULL
for (cell_type in colnames(res_CARseq$p)) {
CARseq_filename = file.path(cache_dir, sprintf("Autism_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("Autism_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
CARseq_fgseaResPositive = CARseq_fgseaRes[CARseq_fgseaRes$NES > 0, ] # genes that are enriched in small p-values
CARseq_topPathways = CARseq_fgseaResPositive[order(CARseq_fgseaResPositive[,"padj"], -CARseq_fgseaResPositive[,"NES"]), ]$pathway
topPathways_table = rbind(topPathways_table,
data.frame(pathway=CARseq_topPathways,
cell_type=cell_type,
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(CARseq_fgseaRes$pval)
[match(CARseq_topPathways, CARseq_fgseaRes$pathway)]),
NES = CARseq_fgseaRes[,"NES"]
[match(CARseq_topPathways, CARseq_fgseaRes$pathway)],
CT_specific_rank=seq_along(CARseq_topPathways)))
}
topPathways_table$cell_type = factor(as.character(topPathways_table$cell_type), levels = colnames(res_CARseq$p))
# deduplicate to remove the same pathway with a less significant q value
topPathways_table = topPathways_table[order(topPathways_table$CT_specific_rank, -topPathways_table$log10qvalue, -topPathways_table$NES), ]
topPathways_table = topPathways_table[!duplicated(topPathways_table$pathway), ]
# remove pathways that are not in the top N of each cell type
N = 3
topN_Pathways_table = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
cell_type_pathway_table = topPathways_table[topPathways_table$cell_type == cell_type, ]
topN_Pathways_table = rbind(topN_Pathways_table, cell_type_pathway_table[1:N, ])
}
# create a matrix for log10 q values of GSEA results of REACTOME
# The first 6 columns are CARseq, and the next 6 columns are TOAST, and the last column is DESeq2
pathway_log10qvalue_matrix = matrix(NA, nrow = nrow(topN_Pathways_table), ncol = ncol(res_CARseq$p) * 2 + 1)
rownames(pathway_log10qvalue_matrix) = as.character(topN_Pathways_table$pathway)
colnames(pathway_log10qvalue_matrix) = c(paste("CARseq", colnames(res_CARseq$p)), paste("TOAST", colnames(res_CARseq$p)), "DESeq2 bulk")
# fill in the blanks for CARseq, TOAST, and DESeq2 log 10 q values
DESeq2_filename = file.path(cache_dir, "Autism_DESeq2_GSEAMultilevel_REACTOME.rds")
DESeq2_fgseaRes = readRDS(DESeq2_filename)
indices_CARseq = match(rownames(pathway_log10qvalue_matrix), CARseq_fgseaRes$pathway)
indices_TOAST = match(rownames(pathway_log10qvalue_matrix), TOAST_fgseaRes$pathway)
indices_DESeq2 = match(rownames(pathway_log10qvalue_matrix), DESeq2_fgseaRes$pathway)
pathway_log10qvalue_matrix[, 1 + 2 * ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(DESeq2_fgseaRes$pval)) *
sign(DESeq2_fgseaRes$NES))[indices_DESeq2]
for (j in seq_len(ncol(res_CARseq$p))) {
cell_type = colnames(res_CARseq$p)[j]
CARseq_filename = file.path(cache_dir, sprintf("Autism_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("Autism_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
pathway_log10qvalue_matrix[, j] =
(-log10(CARseq:::get_qvalues_one_inflated(CARseq_fgseaRes$pval)) *
sign(CARseq_fgseaRes$NES))[indices_CARseq]
pathway_log10qvalue_matrix[, j + ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(TOAST_fgseaRes$pval)) *
sign(TOAST_fgseaRes$NES))[indices_TOAST]
}
write.csv(pathway_log10qvalue_matrix, file=file.path(cache_dir, "Autism_gsea_CARseq_vs_TOAST_REACTOME_ranked_by_CARseq_heatmap.csv"), quote=FALSE)
# heatmap
rownames(pathway_log10qvalue_matrix) = str_wrap(sub("REACTOME", "", gsub("_", " ", rownames(pathway_log10qvalue_matrix))), 75)
ann_colors = list(TopPathwaysByCT = as.character(wes_palette(name="Darjeeling2", 6, type="continuous")),
Method = as.character(wes_palette("FantasticFox1", 3)),
CellType = c(as.character(wes_palette(name="Darjeeling2", 6, type="continuous")), "grey"))
names(ann_colors[["TopPathwaysByCT"]]) = colnames(res_CARseq$p)
names(ann_colors[["Method"]]) = c("CARseq", "TOAST", "DESeq2")
names(ann_colors[["CellType"]]) = c(colnames(res_CARseq$p), "bulk")
annotation_row = data.frame(TopPathwaysByCT=factor(rep(colnames(res_CARseq$p), each=N)))
rownames(annotation_row) = rownames(pathway_log10qvalue_matrix)
annotation_col = data.frame(Method=factor(c(rep("CARseq", ncol(res_CARseq$p)), rep("TOAST", ncol(res_CARseq$p)), "DESeq2"),
levels=c("CARseq", "TOAST", "DESeq2")),
CellType=factor(c(rep(colnames(res_CARseq$p), times=2), "bulk")))
rownames(annotation_col) = colnames(pathway_log10qvalue_matrix)
g_asd_4 = pheatmap(pathway_log10qvalue_matrix,
cluster_rows = F, cluster_cols = F,
border_color ="grey60", na_col = "white", angle_col = "90",
colorRampPalette(c("darkblue", "royalblue", "grey95", "grey95", "pink1", "darkred"))(8),
breaks = seq(-4, 4, by=1), legend_breaks = c(seq(-3, 3, by=1), 4),
legend_labels = c(seq(-3, 3, by=1), "signed\n-log10(q)"),
gaps_row = seq(N, N*ncol(res_CARseq$p), by = N),
gaps_col = c(ncol(res_CARseq$p), ncol(res_CARseq$p)*2),
annotation_row = annotation_row, annotation_col=annotation_col,
annotation_colors = ann_colors, fontsize_row = 8,
cellwidth = 12, cellheight = 10)
g_asd_4Figure: Top 3 TOAST cell type-specific REACTOME pathways ranked by -log10 q value with the sign of NES in the ASD study.
topPathways_table = NULL
for (cell_type in colnames(res_CARseq$p)) {
CARseq_filename = file.path(cache_dir, sprintf("Autism_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("Autism_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
TOAST_fgseaResPositive = TOAST_fgseaRes[TOAST_fgseaRes$NES > 0, ] # genes that are enriched in small p-values
TOAST_topPathways = TOAST_fgseaResPositive[order(TOAST_fgseaResPositive[,"padj"], -TOAST_fgseaResPositive[,"NES"]), ]$pathway
topPathways_table = rbind(topPathways_table,
data.frame(pathway=TOAST_topPathways,
cell_type=cell_type,
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(TOAST_fgseaRes$pval)
[match(TOAST_topPathways, TOAST_fgseaRes$pathway)]),
NES = TOAST_fgseaRes[,"NES"]
[match(TOAST_topPathways, TOAST_fgseaRes$pathway)],
CT_specific_rank=seq_along(TOAST_topPathways)))
}
topPathways_table$cell_type = factor(as.character(topPathways_table$cell_type), levels = colnames(res_TOAST$p))
# deduplicate to remove the same pathway with a less significant q value
topPathways_table = topPathways_table[order(topPathways_table$CT_specific_rank, -topPathways_table$log10qvalue, -topPathways_table$NES), ]
topPathways_table = topPathways_table[!duplicated(topPathways_table$pathway), ]
# remove pathways that are not in the top N of each cell type
N = 3
topN_Pathways_table = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
cell_type_pathway_table = topPathways_table[topPathways_table$cell_type == cell_type, ]
topN_Pathways_table = rbind(topN_Pathways_table, cell_type_pathway_table[1:N, ])
}
# create a matrix for log10 q values of GSEA results of REACTOME
# The first 6 columns are CARseq, and the next 6 columns are TOAST, and the last column is DESeq2
pathway_log10qvalue_matrix = matrix(NA, nrow = nrow(topN_Pathways_table), ncol = ncol(res_CARseq$p) * 2 + 1)
rownames(pathway_log10qvalue_matrix) = as.character(topN_Pathways_table$pathway)
colnames(pathway_log10qvalue_matrix) = c(paste("CARseq", colnames(res_CARseq$p)), paste("TOAST", colnames(res_CARseq$p)), "DESeq2 bulk")
# fill in the blanks for CARseq, TOAST, and DESeq2 log 10 q values
DESeq2_filename = file.path(cache_dir, "Autism_DESeq2_GSEAMultilevel_REACTOME.rds")
DESeq2_fgseaRes = readRDS(DESeq2_filename)
indices_CARseq = match(rownames(pathway_log10qvalue_matrix), CARseq_fgseaRes$pathway)
indices_TOAST = match(rownames(pathway_log10qvalue_matrix), TOAST_fgseaRes$pathway)
indices_DESeq2 = match(rownames(pathway_log10qvalue_matrix), DESeq2_fgseaRes$pathway)
pathway_log10qvalue_matrix[, 1 + 2 * ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(DESeq2_fgseaRes$pval)) *
sign(DESeq2_fgseaRes$NES))[indices_DESeq2]
for (j in seq_len(ncol(res_CARseq$p))) {
cell_type = colnames(res_CARseq$p)[j]
CARseq_filename = file.path(cache_dir, sprintf("Autism_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("Autism_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
pathway_log10qvalue_matrix[, j] =
(-log10(CARseq:::get_qvalues_one_inflated(CARseq_fgseaRes$pval)) *
sign(CARseq_fgseaRes$NES))[indices_CARseq]
pathway_log10qvalue_matrix[, j + ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(TOAST_fgseaRes$pval)) *
sign(TOAST_fgseaRes$NES))[indices_TOAST]
}
write.csv(pathway_log10qvalue_matrix, file=file.path(cache_dir, "Autism_gsea_CARseq_vs_TOAST_REACTOME_ranked_by_TOAST_heatmap.csv"), quote=FALSE)
# heatmap
rownames(pathway_log10qvalue_matrix) = str_wrap(sub("REACTOME", "", gsub("_", " ", rownames(pathway_log10qvalue_matrix))), 75)
ann_colors = list(TopPathwaysByCT = as.character(wes_palette(name="Darjeeling2", 6, type="continuous")),
Method = as.character(wes_palette("FantasticFox1", 3)),
CellType = c(as.character(wes_palette(name="Darjeeling2", 6, type="continuous")), "grey"))
names(ann_colors[["TopPathwaysByCT"]]) = colnames(res_CARseq$p)
names(ann_colors[["Method"]]) = c("CARseq", "TOAST", "DESeq2")
names(ann_colors[["CellType"]]) = c(colnames(res_CARseq$p), "bulk")
annotation_row = data.frame(TopPathwaysByCT=factor(rep(colnames(res_CARseq$p), each=N)))
rownames(annotation_row) = rownames(pathway_log10qvalue_matrix)
annotation_col = data.frame(Method=factor(c(rep("CARseq", ncol(res_CARseq$p)), rep("TOAST", ncol(res_CARseq$p)), "DESeq2"),
levels=c("CARseq", "TOAST", "DESeq2")),
CellType=factor(c(rep(colnames(res_CARseq$p), times=2), "bulk")))
rownames(annotation_col) = colnames(pathway_log10qvalue_matrix)
pheatmap(pathway_log10qvalue_matrix,
cluster_rows = F, cluster_cols = F,
border_color ="grey60", na_col = "white", angle_col = "90",
colorRampPalette(c("darkblue", "royalblue", "grey95", "grey95", "pink1", "darkred"))(8),
breaks = seq(-4, 4, by=1), legend_breaks = c(seq(-3, 3, by=1), 4),
legend_labels = c(seq(-3, 3, by=1), "signed\n-log10(q)"),
gaps_row = seq(N, N*ncol(res_CARseq$p), by = N),
gaps_col = c(ncol(res_CARseq$p), ncol(res_CARseq$p)*2),
annotation_row = annotation_row, annotation_col=annotation_col,
annotation_colors = ann_colors, fontsize_row = 8,
cellwidth = 12, cellheight = 15)We use “classic” weight where no weight is added. The genes are ranked by p-values. The gene matching is done in gene symbol. There is a lot of genes with p-values being 1, which “may or may not cause trouble”, so we only care about the genes that are enriched in small p-values:
# Running fgsea:
pval = -log10(res_CARseq$p[, "Astro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.901 0.178 0.128 0 0.744 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_BP_Astro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Exc"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.219 0.759 0.59 0 0.139 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_BP_Exc.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Inh"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 1.533 1.104 0.274 0.165 0 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_BP_Inh.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Oligo"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.445 0.326 0 0.373 0 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_BP_Oligo.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Micro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0.155 0 0.112 0.812 0.204 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_BP_Micro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "OPC"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:19604] 0 0 0 0 0.146 ...
## - attr(*, "names")= chr [1:19604] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 19604
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 19547
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "Autism_CARseq_GSEAMultilevel_BP_OPC.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.scRNAseq_sig = readRDS(file.path(scz_working_dir, "./MTG/all_genes_MTG.rds"))
scRNAseq_sig$sig_matched = scRNAseq_sig$SIG[match(rowData(rse_filtered)$gene_name, rownames(scRNAseq_sig$SIG)), ]
colnames(scRNAseq_sig$sig_matched) = paste("scRNAseq", colnames(scRNAseq_sig$sig_matched)) # expression from MTG scRNAseq
dim(scRNAseq_sig$sig_matched)## [1] 19604 6
Plot a heatmap of correlation matrix
# Spearman correlation
# With infinite LFCs
res_CARseq_coefficients = res_CARseq$coefficients[, grep("Control|ASD", colnames(res_CARseq$coefficients))]
cor(res_CARseq_coefficients, method="spearman", use="pairwise.complete.obs")[1:6,7:12]## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## Control:Astro 0.69 0.191 0.0014 0.175 0.387 0.114
## Control:Exc 0.30 0.905 0.6794 0.094 0.480 0.039
## Control:Inh 0.16 0.489 0.3450 -0.098 0.146 0.309
## Control:Micro 0.25 -0.077 -0.0784 0.281 0.038 0.026
## Control:Oligo 0.42 0.536 0.3319 0.165 0.750 0.206
## Control:OPC 0.36 0.234 -0.0789 0.097 0.172 0.392
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.46 0.52 0.138 0.048
## scRNAseq Exc 0.11 0.82 0.312 -0.096
## scRNAseq Inh 0.16 0.77 0.340 -0.066
## scRNAseq Micro 0.23 0.43 0.078 0.171
## scRNAseq Oligo 0.25 0.60 0.146 0.028
## scRNAseq OPC 0.28 0.62 0.202 0.018
## Control:Oligo Control:OPC
## scRNAseq Astro 0.45 0.124
## scRNAseq Exc 0.40 0.080
## scRNAseq Inh 0.43 0.085
## scRNAseq Micro 0.39 0.094
## scRNAseq Oligo 0.66 0.111
## scRNAseq OPC 0.47 0.122
## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## scRNAseq Astro 0.54 0.49 0.39 0.0660 0.53 0.0421
## scRNAseq Exc 0.23 0.80 0.61 -0.0262 0.40 -0.0226
## scRNAseq Inh 0.26 0.74 0.64 0.0029 0.44 -0.0066
## scRNAseq Micro 0.29 0.39 0.33 0.2642 0.44 -0.0074
## scRNAseq Oligo 0.34 0.56 0.46 0.0750 0.71 0.0192
## scRNAseq OPC 0.35 0.57 0.52 0.0666 0.52 0.0526
# With shrunken LFCs -- note that the shrunken LFCs are not very good
res_CARseq_shrunken_coefficients = res_CARseq$shrunken_coefficients[, grep("Control|ASD", colnames(res_CARseq$shrunken_coefficients))]
res_CARseq_shrunken_coefficients[] = c(res_CARseq_shrunken_coefficients) + c(res_CARseq$shrunken_coefficients[, grep("intercept", colnames(res_CARseq$shrunken_coefficients))])
cor(res_CARseq_shrunken_coefficients, method="spearman", use="pairwise.complete.obs")[1:6,7:12]## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## Control:Astro 0.97 0.35 0.196 0.251 0.49 0.34
## Control:Exc 0.35 0.99 0.704 0.176 0.55 0.29
## Control:Inh 0.22 0.67 0.965 0.017 0.42 0.24
## Control:Micro 0.22 0.17 0.026 0.988 0.22 0.10
## Control:Oligo 0.47 0.54 0.449 0.244 0.98 0.29
## Control:OPC 0.39 0.33 0.150 0.106 0.25 0.99
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.57 0.52 0.37 0.121
## scRNAseq Exc 0.23 0.83 0.59 0.029
## scRNAseq Inh 0.27 0.77 0.62 0.061
## scRNAseq Micro 0.31 0.42 0.28 0.306
## scRNAseq Oligo 0.36 0.60 0.42 0.136
## scRNAseq OPC 0.38 0.61 0.48 0.124
## Control:Oligo Control:OPC
## scRNAseq Astro 0.54 0.20
## scRNAseq Exc 0.43 0.15
## scRNAseq Inh 0.47 0.17
## scRNAseq Micro 0.45 0.12
## scRNAseq Oligo 0.73 0.19
## scRNAseq OPC 0.55 0.22
## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## scRNAseq Astro 0.56 0.50 0.42 0.134 0.56 0.20
## scRNAseq Exc 0.24 0.81 0.65 0.044 0.45 0.15
## scRNAseq Inh 0.27 0.75 0.68 0.076 0.49 0.17
## scRNAseq Micro 0.30 0.40 0.34 0.317 0.47 0.12
## scRNAseq Oligo 0.35 0.57 0.49 0.149 0.75 0.19
## scRNAseq OPC 0.36 0.59 0.55 0.137 0.56 0.22
# With infinite LFCs as NAs
res_CARseq_coefficients_as_NAs = res_CARseq$coefficients[, grep("Control|ASD", colnames(res_CARseq$coefficients))]
res_CARseq_coefficients_as_NAs[!is.finite(res_CARseq_coefficients_as_NAs)] = NA
cor(res_CARseq_coefficients_as_NAs, method="spearman", use="pairwise.complete.obs")[1:6,7:12]## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## Control:Astro 0.86 0.69 0.63 0.63 0.71 0.65
## Control:Exc 0.65 0.95 0.88 0.54 0.76 0.68
## Control:Inh 0.65 0.80 0.73 0.52 0.64 0.75
## Control:Micro 0.65 0.60 0.57 0.64 0.58 0.55
## Control:Oligo 0.69 0.77 0.70 0.57 0.90 0.69
## Control:OPC 0.71 0.73 0.57 0.54 0.62 0.78
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.59 0.55 0.33 0.33
## scRNAseq Exc 0.42 0.81 0.53 0.31
## scRNAseq Inh 0.45 0.77 0.54 0.33
## scRNAseq Micro 0.36 0.44 0.23 0.38
## scRNAseq Oligo 0.44 0.61 0.36 0.32
## scRNAseq OPC 0.48 0.63 0.40 0.34
## Control:Oligo Control:OPC
## scRNAseq Astro 0.48 0.36
## scRNAseq Exc 0.51 0.42
## scRNAseq Inh 0.53 0.42
## scRNAseq Micro 0.39 0.26
## scRNAseq Oligo 0.67 0.38
## scRNAseq OPC 0.52 0.39
## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## scRNAseq Astro 0.58 0.52 0.45 0.31 0.52 0.37
## scRNAseq Exc 0.42 0.78 0.68 0.26 0.52 0.42
## scRNAseq Inh 0.44 0.74 0.70 0.28 0.54 0.43
## scRNAseq Micro 0.34 0.43 0.36 0.42 0.43 0.24
## scRNAseq Oligo 0.43 0.57 0.51 0.29 0.71 0.37
## scRNAseq OPC 0.46 0.59 0.57 0.30 0.55 0.40
# With infinite LFCs as NAs & Pearson correlation
cor(res_CARseq_coefficients_as_NAs, method="pearson", use="pairwise.complete.obs")[1:6,7:12]## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## Control:Astro 0.79 0.62 0.59 0.63 0.67 0.62
## Control:Exc 0.59 0.92 0.84 0.52 0.73 0.65
## Control:Inh 0.60 0.72 0.70 0.52 0.61 0.74
## Control:Micro 0.58 0.58 0.52 0.62 0.56 0.56
## Control:Oligo 0.64 0.71 0.65 0.57 0.86 0.63
## Control:OPC 0.66 0.63 0.53 0.52 0.59 0.72
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.59 0.53 0.31 0.30
## scRNAseq Exc 0.40 0.79 0.50 0.30
## scRNAseq Inh 0.44 0.75 0.51 0.31
## scRNAseq Micro 0.33 0.40 0.19 0.35
## scRNAseq Oligo 0.42 0.59 0.33 0.30
## scRNAseq OPC 0.47 0.61 0.38 0.32
## Control:Oligo Control:OPC
## scRNAseq Astro 0.47 0.33
## scRNAseq Exc 0.49 0.38
## scRNAseq Inh 0.51 0.39
## scRNAseq Micro 0.35 0.21
## scRNAseq Oligo 0.65 0.34
## scRNAseq OPC 0.51 0.35
## ASD:Astro ASD:Exc ASD:Inh ASD:Micro ASD:Oligo ASD:OPC
## scRNAseq Astro 0.57 0.49 0.43 0.31 0.51 0.36
## scRNAseq Exc 0.40 0.75 0.65 0.25 0.50 0.40
## scRNAseq Inh 0.42 0.71 0.67 0.27 0.52 0.41
## scRNAseq Micro 0.31 0.39 0.33 0.40 0.40 0.19
## scRNAseq Oligo 0.42 0.54 0.49 0.29 0.70 0.35
## scRNAseq OPC 0.45 0.56 0.54 0.29 0.54 0.39
# plot
# library(pheatmap)
cor_all = cor(cbind(res_CARseq_coefficients_as_NAs, log(1 + scRNAseq_sig$sig_matched)), method="pearson", use="pairwise.complete.obs")
# cor_all[lower.tri(cor_all, diag=TRUE)] = NA
cor_all = cor_all[1:12, 7:18]
cor_all[7:12, 1:6] = NA
cor_all_as_character = matrix(sprintf("%.2f", cor_all), nrow=nrow(cor_all))
cor_all_as_character[cor_all_as_character == "NA"] = ""
pheatmap(cor_all, cluster_rows = F, cluster_cols = F, display_numbers = cor_all_as_character, border_color = NA, na_col = "white", angle_col = "90", colorRampPalette(c("blue", "royalblue", "white", "pink", "red"))(21), breaks = seq(-1, 1, by=0.1), gaps_row = 6, gaps_col = 6)cellular_proportions = readRDS(file.path(autism_working_dir, "data/ASD_prop.rds"))
cellular_proportions_longtable =
cellular_proportions$ICeDT[colnames(rse_filtered), ] %>% as.data.frame() %>%
rownames_to_column(var = "samples") %>%
add_column(group = colData(rse_filtered)$Diagnosis) %>%
add_column(Exc_prop = cellular_proportions$ICeDT[,"Exc"]) %>%
pivot_longer(cols = Astro:OPC, names_to = "cell type", values_to = "cellular proportions")
# facet_wrap(~group) is not a good solution since the samples in different facets are assumed to be the same.
# Another idea is to sort the samples by SCZ or Control, and add a layer of gray to SCZ samples.
g_asd_1_ASD = ggplot(data=cellular_proportions_longtable %>% subset(group %in% "ASD"), aes(x=reorder(samples, Exc_prop), y=`cellular proportions`, fill=`cell type`)) +
geom_bar(stat = "identity", width=1) +
xlab("sample") +
ggtitle("ASD samples") +
scale_fill_manual(values = wes_palette(6, name="Darjeeling2", type="continuous")) +
theme_minimal() +
theme(panel.grid.major = element_blank(), axis.text.x = element_blank(), legend.position = "none")
g_asd_1_Control = ggplot(data=cellular_proportions_longtable %>% subset(group %in% "Control"), aes(x=reorder(samples, Exc_prop), y=`cellular proportions`, fill=`cell type`)) +
geom_bar(stat = "identity", width=1) +
xlab("sample") +
ggtitle("Control samples") +
scale_fill_manual(values = wes_palette(6, name="Darjeeling2", type="continuous")) +
theme_minimal() +
theme(panel.grid.major = element_blank(), axis.text.x = element_blank())
g_asd_1 = ggarrange(g_asd_1_ASD, g_asd_1_Control, ncol = 2, widths = c(4,5), nrow = 1)
g_asd_1# log ratio against Exc
summary_ICeDT = list()
summary_ICeDT[["Astro"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Astro"]) / (1e-2 + cellular_proportions$ICeDT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_ICeDT[["Astro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Astro"])/(0.01 +
## cellular_proportions$ICeDT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8418 -0.2306 0.0099 0.1843 1.4430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.885836 1.165291 0.76 0.450
## DiagnosisASD 0.216780 0.092129 2.35 0.021 *
## BrainBankNICHD -0.234046 0.116232 -2.01 0.048 *
## Sequencing.Batchbatch2 -0.361065 0.176360 -2.05 0.044 *
## Sequencing.Batchbatch3 0.171329 0.191946 0.89 0.375
## log_depth -0.322590 0.165765 -1.95 0.055 .
## AgeDeath 0.000336 0.002816 0.12 0.905
## RIN -0.040594 0.037810 -1.07 0.287
## seqSV1 0.048435 0.075278 0.64 0.522
## seqSV2 0.104737 0.053286 1.97 0.053 .
## seqSV3 0.028425 0.062374 0.46 0.650
## seqSV4 0.006035 0.048477 0.12 0.901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.41 on 73 degrees of freedom
## Multiple R-squared: 0.389, Adjusted R-squared: 0.297
## F-statistic: 4.22 on 11 and 73 DF, p-value: 7.77e-05
summary_ICeDT[["Inh"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Inh"]) / (1e-2 + cellular_proportions$ICeDT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_ICeDT[["Inh"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Inh"])/(0.01 +
## cellular_proportions$ICeDT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2058 -0.0815 0.0145 0.1182 0.9598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.81731 0.80053 -2.27 0.026 *
## DiagnosisASD -0.03733 0.06329 -0.59 0.557
## BrainBankNICHD 0.12850 0.07985 1.61 0.112
## Sequencing.Batchbatch2 0.21662 0.12116 1.79 0.078 .
## Sequencing.Batchbatch3 0.07965 0.13186 0.60 0.548
## log_depth 0.02513 0.11388 0.22 0.826
## AgeDeath -0.00511 0.00193 -2.64 0.010 *
## RIN 0.03470 0.02597 1.34 0.186
## seqSV1 -0.04471 0.05171 -0.86 0.390
## seqSV2 -0.03985 0.03661 -1.09 0.280
## seqSV3 0.07719 0.04285 1.80 0.076 .
## seqSV4 -0.02074 0.03330 -0.62 0.535
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.28 on 73 degrees of freedom
## Multiple R-squared: 0.28, Adjusted R-squared: 0.171
## F-statistic: 2.58 on 11 and 73 DF, p-value: 0.00797
summary_ICeDT[["Micro"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Micro"]) / (1e-2 + cellular_proportions$ICeDT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_ICeDT[["Micro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Micro"])/(0.01 +
## cellular_proportions$ICeDT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8375 -0.2304 -0.0616 0.1376 1.8853
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.07832 1.17948 -2.61 0.0110 *
## DiagnosisASD 0.10745 0.09325 1.15 0.2530
## BrainBankNICHD -0.19325 0.11765 -1.64 0.1048
## Sequencing.Batchbatch2 -0.17855 0.17851 -1.00 0.3205
## Sequencing.Batchbatch3 0.01030 0.19428 0.05 0.9579
## log_depth 0.03697 0.16778 0.22 0.8262
## AgeDeath -0.00758 0.00285 -2.66 0.0096 **
## RIN -0.00742 0.03827 -0.19 0.8468
## seqSV1 -0.08211 0.07619 -1.08 0.2848
## seqSV2 0.03790 0.05393 0.70 0.4845
## seqSV3 0.03752 0.06313 0.59 0.5542
## seqSV4 0.03048 0.04907 0.62 0.5365
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.41 on 73 degrees of freedom
## Multiple R-squared: 0.198, Adjusted R-squared: 0.0777
## F-statistic: 1.64 on 11 and 73 DF, p-value: 0.105
summary_ICeDT[["Oligo"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Oligo"]) / (1e-2 + cellular_proportions$ICeDT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_ICeDT[["Oligo"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Oligo"])/(0.01 +
## cellular_proportions$ICeDT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4593 -0.4110 -0.0195 0.3658 2.5181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35306 1.85413 0.19 0.8495
## DiagnosisASD -0.15983 0.14659 -1.09 0.2792
## BrainBankNICHD -0.40515 0.18494 -2.19 0.0317 *
## Sequencing.Batchbatch2 -0.70765 0.28061 -2.52 0.0139 *
## Sequencing.Batchbatch3 0.52261 0.30541 1.71 0.0913 .
## log_depth -0.37614 0.26375 -1.43 0.1581
## AgeDeath 0.00784 0.00448 1.75 0.0845 .
## RIN 0.02912 0.06016 0.48 0.6298
## seqSV1 0.08256 0.11978 0.69 0.4928
## seqSV2 0.01510 0.08478 0.18 0.8591
## seqSV3 0.27695 0.09924 2.79 0.0067 **
## seqSV4 -0.01597 0.07713 -0.21 0.8365
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.65 on 73 degrees of freedom
## Multiple R-squared: 0.266, Adjusted R-squared: 0.156
## F-statistic: 2.41 on 11 and 73 DF, p-value: 0.013
summary_ICeDT[["OPC"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"OPC"]) / (1e-2 + cellular_proportions$ICeDT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_ICeDT[["OPC"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "OPC"])/(0.01 +
## cellular_proportions$ICeDT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6070 -0.1785 -0.0259 0.1101 1.2698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.17580 0.91790 -0.19 0.849
## DiagnosisASD 0.12659 0.07257 1.74 0.085 .
## BrainBankNICHD -0.04969 0.09156 -0.54 0.589
## Sequencing.Batchbatch2 -0.34764 0.13892 -2.50 0.015 *
## Sequencing.Batchbatch3 0.05519 0.15120 0.37 0.716
## log_depth -0.18408 0.13057 -1.41 0.163
## AgeDeath -0.01044 0.00222 -4.71 1.2e-05 ***
## RIN -0.06350 0.02978 -2.13 0.036 *
## seqSV1 0.01580 0.05930 0.27 0.791
## seqSV2 0.08486 0.04197 2.02 0.047 *
## seqSV3 0.08848 0.04913 1.80 0.076 .
## seqSV4 -0.02462 0.03819 -0.64 0.521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.32 on 73 degrees of freedom
## Multiple R-squared: 0.496, Adjusted R-squared: 0.42
## F-statistic: 6.53 on 11 and 73 DF, p-value: 1.87e-07
# log ratio against Exc
summary_CIBERSORT = list()
summary_CIBERSORT[["Astro"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Astro"]) / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_CIBERSORT[["Astro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Astro"])/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7855 -0.4306 0.0076 0.4492 2.3009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.52391 2.22302 1.14 0.260
## DiagnosisASD 0.40558 0.17575 2.31 0.024 *
## BrainBankNICHD -0.53613 0.22173 -2.42 0.018 *
## Sequencing.Batchbatch2 -0.63585 0.33644 -1.89 0.063 .
## Sequencing.Batchbatch3 0.03933 0.36618 0.11 0.915
## log_depth -0.69978 0.31623 -2.21 0.030 *
## AgeDeath 0.00868 0.00537 1.62 0.110
## RIN -0.08695 0.07213 -1.21 0.232
## seqSV1 0.16946 0.14361 1.18 0.242
## seqSV2 0.10780 0.10165 1.06 0.292
## seqSV3 -0.11728 0.11899 -0.99 0.328
## seqSV4 0.08188 0.09248 0.89 0.379
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.78 on 73 degrees of freedom
## Multiple R-squared: 0.387, Adjusted R-squared: 0.295
## F-statistic: 4.19 on 11 and 73 DF, p-value: 8.47e-05
summary_CIBERSORT[["Inh"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Inh"]) / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_CIBERSORT[["Inh"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Inh"])/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.344 -0.223 -0.010 0.231 1.721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.97246 1.27752 -0.76 0.449
## DiagnosisASD 0.00450 0.10100 0.04 0.965
## BrainBankNICHD -0.04569 0.12743 -0.36 0.721
## Sequencing.Batchbatch2 -0.20649 0.19335 -1.07 0.289
## Sequencing.Batchbatch3 -0.15151 0.21043 -0.72 0.474
## log_depth -0.23212 0.18173 -1.28 0.206
## AgeDeath -0.00717 0.00309 -2.32 0.023 *
## RIN -0.00584 0.04145 -0.14 0.888
## seqSV1 0.05273 0.08253 0.64 0.525
## seqSV2 -0.13026 0.05842 -2.23 0.029 *
## seqSV3 0.00819 0.06838 0.12 0.905
## seqSV4 0.01950 0.05315 0.37 0.715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.45 on 73 degrees of freedom
## Multiple R-squared: 0.138, Adjusted R-squared: 0.00852
## F-statistic: 1.07 on 11 and 73 DF, p-value: 0.401
summary_CIBERSORT[["Micro"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Micro"]) / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_CIBERSORT[["Micro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Micro"])/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6896 -0.2303 -0.0851 0.1141 1.7442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.67525 1.23311 -2.98 0.0039 **
## DiagnosisASD 0.10655 0.09749 1.09 0.2780
## BrainBankNICHD -0.29779 0.12300 -2.42 0.0180 *
## Sequencing.Batchbatch2 -0.27614 0.18662 -1.48 0.1433
## Sequencing.Batchbatch3 0.06966 0.20312 0.34 0.7326
## log_depth -0.06466 0.17541 -0.37 0.7135
## AgeDeath -0.00285 0.00298 -0.96 0.3418
## RIN 0.00913 0.04001 0.23 0.8202
## seqSV1 -0.04065 0.07966 -0.51 0.6114
## seqSV2 -0.00774 0.05639 -0.14 0.8912
## seqSV3 -0.03326 0.06600 -0.50 0.6158
## seqSV4 0.11210 0.05130 2.19 0.0321 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.43 on 73 degrees of freedom
## Multiple R-squared: 0.211, Adjusted R-squared: 0.0926
## F-statistic: 1.78 on 11 and 73 DF, p-value: 0.0734
summary_CIBERSORT[["Oligo"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Oligo"]) / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_CIBERSORT[["Oligo"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Oligo"])/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7053 -0.3827 -0.0595 0.4006 2.8164
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.60697 2.13305 0.28 0.7768
## DiagnosisASD -0.16519 0.16864 -0.98 0.3306
## BrainBankNICHD -0.59226 0.21276 -2.78 0.0068 **
## Sequencing.Batchbatch2 -0.82758 0.32282 -2.56 0.0124 *
## Sequencing.Batchbatch3 0.59272 0.35135 1.69 0.0959 .
## log_depth -0.43328 0.30343 -1.43 0.1576
## AgeDeath 0.00962 0.00515 1.87 0.0659 .
## RIN 0.02174 0.06921 0.31 0.7543
## seqSV1 0.13743 0.13780 1.00 0.3219
## seqSV2 -0.00602 0.09754 -0.06 0.9510
## seqSV3 0.20035 0.11417 1.75 0.0835 .
## seqSV4 0.02556 0.08874 0.29 0.7741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.75 on 73 degrees of freedom
## Multiple R-squared: 0.265, Adjusted R-squared: 0.154
## F-statistic: 2.39 on 11 and 73 DF, p-value: 0.0136
summary_CIBERSORT[["OPC"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"OPC"]) / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"]))~ Diagnosis+BrainBank+Sequencing.Batch+log_depth + AgeDeath + RIN + seqSV1 + seqSV2 + seqSV3 + seqSV4, data=colData(rse_filtered)))
summary_CIBERSORT[["OPC"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "OPC"])/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"])) ~ Diagnosis + BrainBank +
## Sequencing.Batch + log_depth + AgeDeath + RIN + seqSV1 +
## seqSV2 + seqSV3 + seqSV4, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3104 -0.1043 -0.0323 0.0474 1.1006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.352677 0.645411 -5.19 1.8e-06 ***
## DiagnosisASD 0.059984 0.051027 1.18 0.244
## BrainBankNICHD -0.149330 0.064376 -2.32 0.023 *
## Sequencing.Batchbatch2 -0.212905 0.097679 -2.18 0.033 *
## Sequencing.Batchbatch3 0.035506 0.106312 0.33 0.739
## log_depth -0.125273 0.091811 -1.36 0.177
## AgeDeath 0.000205 0.001560 0.13 0.896
## RIN -0.017074 0.020942 -0.82 0.418
## seqSV1 0.013984 0.041694 0.34 0.738
## seqSV2 -0.004279 0.029513 -0.14 0.885
## seqSV3 -0.021925 0.034547 -0.63 0.528
## seqSV4 0.034837 0.026849 1.30 0.199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.23 on 73 degrees of freedom
## Multiple R-squared: 0.233, Adjusted R-squared: 0.117
## F-statistic: 2.01 on 11 and 73 DF, p-value: 0.0391
# The power is low is probably because CIBERSORT gave 0 estimates:
table(cellular_proportions$CIBERSORT[, "Astro"] == 0)##
## FALSE TRUE
## 83 2
##
## FALSE TRUE
## 3 82
# See http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
# collect summary of cellular fraction linear models
summary_table = NULL
for (cell_type in names(summary_ICeDT)) {
summary_table = rbind(summary_table,
data.frame(method="ICeDT",
cell_type=cell_type,
diagnosis_effect=summary_ICeDT[[cell_type]]$coefficients[2, "Estimate"],
se=summary_ICeDT[[cell_type]]$coefficients[2, "Std. Error"],
pval=summary_ICeDT[[cell_type]]$coefficients[2, "Pr(>|t|)"])
)
summary_table = rbind(summary_table,
data.frame(method="CIBERSORT",
cell_type=cell_type,
diagnosis_effect=summary_CIBERSORT[[cell_type]]$coefficients[2, "Estimate"],
se=summary_CIBERSORT[[cell_type]]$coefficients[2, "Std. Error"],
pval=summary_CIBERSORT[[cell_type]]$coefficients[2, "Pr(>|t|)"])
)
}
summary_table$method = factor(summary_table$method, levels=c("ICeDT", "CIBERSORT"))
summary_table## method cell_type diagnosis_effect se pval
## 1 ICeDT Astro 0.2168 0.092 0.021
## 2 CIBERSORT Astro 0.4056 0.176 0.024
## 3 ICeDT Inh -0.0373 0.063 0.557
## 4 CIBERSORT Inh 0.0045 0.101 0.965
## 5 ICeDT Micro 0.1074 0.093 0.253
## 6 CIBERSORT Micro 0.1066 0.097 0.278
## 7 ICeDT Oligo -0.1598 0.147 0.279
## 8 CIBERSORT Oligo -0.1652 0.169 0.331
## 9 ICeDT OPC 0.1266 0.073 0.085
## 10 CIBERSORT OPC 0.0600 0.051 0.244
# The errorbars overlapped, so use position_dodge to move them horizontally
pd = position_dodge(0.3) # move them to the left and right
summary_table$cell_type = factor(summary_table$cell_type, levels = rev(levels(summary_table$cell_type)))
g_asd_2 = ggplot(summary_table, aes(x=cell_type, y=diagnosis_effect, colour=method, group=method)) +
geom_errorbar(aes(ymin=diagnosis_effect-se, ymax=diagnosis_effect+se), colour="black", width=.1, position=pd) +
# geom_line(position=pd) +
geom_point(position=pd, size=3, shape=21, fill="white") + # 21 is filled circle
xlab("Cell types") +
ylab("Difference of log ratio between case and control") +
scale_colour_hue(name="Deconvolution method", # Legend label, use darker colors
breaks=c("ICeDT", "CIBERSORT"),
labels=c("ICeDT", "CIBERSORT"),
l=40) + # Use darker colors, lightness=40
ggtitle(" Cellular fraction estimates\n on ASD vs. Control\n adjusted for covariates") +
expand_limits(y=0) + # Expand y range
theme_bw() +
geom_hline(yintercept=0, linetype="dashed")
g_asd_2volcano_plot_data = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(res_CARseq$p[, cell_type]))
log10qvalue_significant = which(log10qvalue > -log10(0.1))
if (length(log10qvalue_significant) > 0) {
volcano_plot_data = rbind(volcano_plot_data,
data.frame(cell_type = cell_type,
gene_name = rowData(rse_filtered)$gene_name[log10qvalue_significant],
log10qvalue = log10qvalue[log10qvalue_significant],
LFC = res_CARseq$lfc[log10qvalue_significant, grep(cell_type, colnames(res_CARseq$lfc))]
))
}
}
dim(volcano_plot_data)## [1] 1067 4
# remove the LFCs that are not finite
volcano_plot_data = volcano_plot_data[is.finite(volcano_plot_data$LFC), ]
dim(volcano_plot_data)## [1] 493 4
volcano_plot_data$gene_name_subset = as.character(volcano_plot_data$gene_name)
volcano_plot_data$gene_name_subset[rank(-volcano_plot_data$log10qvalue) > 10] = ""
range(volcano_plot_data$log10qvalue)## [1] 1.0 1.5
## [1] -5.4 37.4
ggplot(volcano_plot_data, aes(x=LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0)) +
geom_hline(yintercept=1, linetype="dashed") +
# geom_hline(yintercept=-log10(0.25), linetype="dotted") +
xlim(-max(abs(volcano_plot_data$LFC)), max(abs(volcano_plot_data$LFC))) +
geom_vline(xintercept=0) +
theme_minimal()volcano_plot_data = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(res_CARseq$p[, cell_type]))
# only showing gene-cell type pairs with q value < 0.1 & shrunken lfc > 0.01 in abs value
log10qvalue_significant = which(log10qvalue > -log10(0.1) & abs(res_CARseq$shrunken_lfc[, grep(cell_type, colnames(res_CARseq$shrunken_lfc))]) > 0.01)
if (length(log10qvalue_significant) > 0) {
volcano_plot_data = rbind(volcano_plot_data,
data.frame(cell_type = cell_type,
gene_name = rowData(rse_filtered)$gene_name[log10qvalue_significant],
pvalue = res_CARseq$p[log10qvalue_significant, cell_type],
log10qvalue = log10qvalue[log10qvalue_significant],
shrunken_LFC = res_CARseq$shrunken_lfc[log10qvalue_significant, grep(cell_type, colnames(res_CARseq$shrunken_lfc))]
))
}
}
dim(volcano_plot_data)## [1] 1048 5
# remove the LFCs that are not finite
volcano_plot_data = volcano_plot_data[is.finite(volcano_plot_data$shrunken_LFC), ]
dim(volcano_plot_data)## [1] 1048 5
# show gene names on the plot when q value < 0.01
volcano_plot_data$gene_name_subset = as.character(volcano_plot_data$gene_name)
# neurons
volcano_plot_data_neuronal = subset(volcano_plot_data, (cell_type %in% c("Exc","Inh")))
if (nrow(volcano_plot_data_neuronal) >= 10) {
volcano_plot_data_neuronal$gene_name_subset[order(order(-volcano_plot_data_neuronal$log10qvalue, volcano_plot_data_neuronal$pvalue)) > 10] = ""
g_asd_3 = ggplot(volcano_plot_data_neuronal, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", box.padding = 0.25, force = 20, size = 3, max.iter = 10000) +
geom_hline(yintercept=1, linetype="dashed") +
# geom_hline(yintercept=-log10(0.25), linetype="dotted") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) +
geom_vline(xintercept=0) +
scale_color_manual(values = wes_palette(6, name="Darjeeling2", type="continuous"), limits = colnames(res_CARseq$p)) +
theme_minimal()
g_asd_3
}# non neurons
volcano_plot_data_non_neuronal = subset(volcano_plot_data, !(cell_type %in% c("Exc","Inh")))
if (nrow(volcano_plot_data_non_neuronal) >= 10) {
volcano_plot_data_non_neuronal$gene_name_subset[order(order(-volcano_plot_data_non_neuronal$log10qvalue, volcano_plot_data_neuronal$pvalue)) > 10] = ""
ggplot(volcano_plot_data_non_neuronal, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0)) +
geom_hline(yintercept=1, linetype="dashed") +
geom_hline(yintercept=-log10(0.25), linetype="dotted") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) + ylim(0.5, max(volcano_plot_data$log10qvalue)) +
geom_vline(xintercept=0) +
scale_color_manual(values = wes_palette(6, name="Darjeeling2", type="continuous"), limits = colnames(res_CARseq$p)) +
theme_minimal()
}volcano_plot_data = data.frame()
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(res_DESeq2$p[, "pvalue"]))
res_DESeq2$p$shrunken_LFC = res_DESeq2$p$log2FoldChange * log(2) # transform log2 to natural log
# only showing gene-cell type pairs with q value < 0.1 & shrunken lfc > 0.1 in abs value
log10qvalue_significant = which(log10qvalue > -log10(0.1)) # & abs(res_DESeq2$p$shrunken_LFC) > 0.1)
if (length(log10qvalue_significant) > 0) {
volcano_plot_data = rbind(volcano_plot_data,
data.frame(cell_type = "DESeq2 bulk",
gene_name = rowData(rse_filtered)$gene_name[log10qvalue_significant],
log10qvalue = log10qvalue[log10qvalue_significant],
shrunken_LFC = res_DESeq2$p$shrunken_LFC[log10qvalue_significant]
))
}
dim(volcano_plot_data)## [1] 1063 4
# remove the LFCs that are not finite
volcano_plot_data = volcano_plot_data[is.finite(volcano_plot_data$shrunken_LFC), ]
dim(volcano_plot_data)## [1] 1063 4
# show gene names on the plot when q value < 0.01
volcano_plot_data$gene_name_subset = as.character(volcano_plot_data$gene_name)
volcano_plot_data$gene_name_subset[rank(-volcano_plot_data$log10qvalue) > 10] = ""
ggplot(volcano_plot_data, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0), box.padding = 0.25, force = 10, size = 4) +
geom_hline(yintercept=1, linetype="dashed") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) + ylim(0.5, max(volcano_plot_data$log10qvalue)) +
geom_vline(xintercept=0) +
theme_minimal()library(gplots)
autism_pvalues = data.frame(CARseq=res_CARseq$p, DESeq2.bulk=res_DESeq2$p$pvalue)
autism_qvalues = apply(autism_pvalues, 2, CARseq:::get_qvalues_one_inflated)
autism_significant = (autism_qvalues < 0.1)
autism_significant[is.na(autism_significant)] = FALSE
opar = par(cex = 0.7)
venn(as.data.frame(autism_significant[,
c("CARseq.Exc", "CARseq.Inh", "DESeq2.bulk")]))set.seed(1234)
scz_working_dir = "."
cache_dir = "reproducible_figures_cache"
if (!file.exists(cache_dir)) dir.create(cache_dir)
# load gene annotation:
rse_filtered = readRDS(file.path(scz_working_dir, "data/rse_filtered_SV.rds"))
############################################################
# 2 SVs
############################################################
# CARseq, ICeDT
# not permuted
res_CARseq = readRDS(file.path(scz_working_dir, "results/SCZ_CARseq_ICeDT_SV2.rds"))
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_CARseq$p))) {
hist(res_CARseq$p[,k], main=colnames(res_CARseq$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_CARseq$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}# permuted
res_CARseq_permuted = readRDS(file.path(scz_working_dir, "results/SCZ_CARseq_ICeDT_permuted_SV2.rds"))
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_CARseq_permuted$p))) {
hist(res_CARseq_permuted$p[,k], main=colnames(res_CARseq_permuted$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_CARseq_permuted$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}We use “classic” weight where no weight is added. The genes are ranked by p-values. The gene matching is done in gene symbol. There is a lot of genes with p-values being 1, which “may or may not cause trouble”, so we only care about the genes that are enriched in small p-values:
############################################################
# 2 SVs
############################################################
# CARseq, ICeDT
# not permuted
res_TOAST = list()
res_TOAST$p = read.table(file.path(scz_working_dir, "results/SCZ_step1_TOAST_with_covariates_SV2.txt"))
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_TOAST$p))) {
hist(res_TOAST$p[,k], main=colnames(res_TOAST$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_TOAST$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}# permuted
res_TOAST_permuted = list()
res_TOAST_permuted$p = read.table(file.path(scz_working_dir, "results/SCZ_step1_TOAST_with_covariates_SV2_permuted.txt"))
par(mfrow=c(2,3), bty="n", mar=c(5,4,2,1))
for(k in seq_len(ncol(res_TOAST_permuted$p))) {
hist(res_TOAST_permuted$p[,k], main=colnames(res_TOAST_permuted$p)[k], breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_TOAST_permuted$p[, k]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
}# Running fgsea:
pval = -log10(res_TOAST$p[, "Astro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.4482 0.1488 0.2394 0.9501 0.0757 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_TOAST_GSEAMultilevel_REACTOME_Astro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 2.9e-05 0.0021 0.58 0.12 2.6 360
## leadingEdge
## 1: CPLX1,KCNA1,LRRTM4,KCNA3,NLGN3,CACNA2D2,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Exc"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.2082 0.2299 0.042 0.0692 0.5065 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_TOAST_GSEAMultilevel_REACTOME_Exc.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.53 0.89 0.075 0.043 0.93 360
## leadingEdge
## 1: PRKACA,GNG3,RAB3A,CACNG3,CPLX1,KCNC3,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Inh"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.274 0.464 0.21 0.495 0.719 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_TOAST_GSEAMultilevel_REACTOME_Inh.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 4.8e-05 0.012 0.56 -0.12 -2.7 360
## leadingEdge
## 1: DBNL,SLITRK3,ARHGEF9,PDPK1,CHRNA7,ACHE,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Oligo"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.157 1.05 0.474 0.46 1.116 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_TOAST_GSEAMultilevel_REACTOME_Oligo.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.53 0.73 0.077 -0.044 -0.94 360
## leadingEdge
## 1: CHRNG,ADCY1,SYT7,MYO6,GRIN1,ADCY5,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "Micro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.941 0.454 1.277 0.44 0.143 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_TOAST_GSEAMultilevel_REACTOME_Micro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.021 0.087 0.35 -0.08 -1.7 360
## leadingEdge
## 1: EPB41L2,GABRG3,PRKAA2,ARL6IP5,KCNH7,KCNH3,...
# Running fgsea:
pval = -log10(res_TOAST$p[, "OPC"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_TOAST$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.296 0.173 0.205 1.041 0.392 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_TOAST_GSEAMultilevel_REACTOME_OPC.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.0012 0.045 0.46 -0.1 -2.2 360
## leadingEdge
## 1: GRM1,BEGAIN,CAMK2B,NEFL,KCNK10,PPM1F,...
The results are similar to what we see in Huckins et al. 2019, where we did not find enrichment in pathways about synapse or neuron functions among excitatory or inhibitory neurons.
We use “classic” weight where no weight is added. The genes are ranked by p-values. The gene matching is done in gene symbol. There is a lot of genes with p-values being 1, which “may or may not cause trouble”, so we only care about the genes that are enriched in small p-values:
# Running fgsea:
pval = -log10(res_CARseq$p[, "Astro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.8858 0.4086 0.6988 0.0764 0.297 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_REACTOME_Astro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 1.6e-36 1.8e-33 1.6 -0.34 -7.4 360
## leadingEdge
## 1: TUBB3,VAMP2,GRIN3A,LRRTM3,CALM1,CACNA1E,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Exc"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0 0 0.497 0.118 0.176 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_REACTOME_Exc.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.08 0.6 0.22 0.069 1.5 360
## leadingEdge
## 1: GNG4,KCNC3,PRKACA,KCNJ8,CPLX1,RASGRF2,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Inh"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0 0.06 0.232 0.122 0 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_REACTOME_Inh.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 6.3e-05 0.0027 0.54 0.12 2.6 360
## leadingEdge
## 1: PRKCB,KCNJ9,NRAS,KCNJ10,GNB3,RASGRF2,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Oligo"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.976 0.516 0 0.375 0 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_REACTOME_Oligo.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 2.6e-19 2.9e-16 1.1 -0.25 -5.4 360
## leadingEdge
## 1: VAMP2,DLGAP2,KCNMB2,SIPA1L1,SRC,IL1RAP,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "Micro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.496 0.429 2.033 0 0.484 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_REACTOME_Micro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 3.8e-29 4.2e-26 1.4 -0.3 -6.6 360
## leadingEdge
## 1: VAMP2,GABBR1,LRRTM3,UNC13B,CALM1,CACNA1E,...
# Running fgsea:
pval = -log10(res_CARseq$p[, "OPC"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0 0.314 1.854 0.638 0 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_REACTOME_OPC.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.00017 0.0021 0.52 0.11 2.5 360
## leadingEdge
## 1: APBA2,IL1RAP,APBA3,KCNMB4,TUBB4B,LRTOMT,...
# Running fgsea:
res_DESeq2 = list()
res_DESeq2$p = read.table(file.path(scz_working_dir, "results/SCZ_step1_DESeq2_bulk_adj_covariates_SV2_lfcShrink.txt"))
res_DESeq2$p_permuted = read.table(file.path(scz_working_dir, "results/SCZ_step1_DESeq2_bulk_adj_covariates_SV2_permuted.txt"))
pval = -log10(res_DESeq2$p[, "pvalue"])
# plot distribution of p-values
opar = par(mfrow=c(1,2), bty="n", mar=c(5,4,2,1))
hist(res_DESeq2$p[, "pvalue"], main="DESeq2 bulk", breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_DESeq2$p[, "pvalue"]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))
hist(res_DESeq2$p_permuted[, "pvalue"], main="DESeq2 bulk permuted", breaks=20,
xlab = paste0("p-value (", sum(CARseq:::get_qvalues_one_inflated(res_DESeq2$p_permuted[, "pvalue"]) < 0.1, na.rm=TRUE), " genes q value < 0.1)"))# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_DESeq2$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.2872 0.5799 0.0663 2.8267 0.1701 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_DESeq2_GSEAMultilevel_REACTOME.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_reactome, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_reactome[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.## pathway pval padj log2err ES NES size
## 1: REACTOME_NEURONAL_SYSTEM 0.041 0.43 0.32 -0.074 -1.6 360
## leadingEdge
## 1: KCNH2,GNAI2,LRRC7,TUBB2A,AP2M1,PRKACB,...
pval_dir = file.path(autism_working_dir, "results/_pvalues")
if (!dir.exists(pval_dir)) dir.create(pval_dir)
for (cell_type in colnames(res_CARseq$p)) {
pvalues = data.frame(gene_id=rowData(rse_filtered)$gene_id,
gene_name=rowData(rse_filtered)$gene_name,
pvalue=res_CARseq$p[, cell_type])
write_delim(pvalues, path = file.path(pval_dir, sprintf("SCZ_CARseq_%s.txt", cell_type)))
}
for (cell_type in colnames(res_TOAST$p)) {
pvalues = data.frame(gene_id=rowData(rse_filtered)$gene_id,
gene_name=rowData(rse_filtered)$gene_name,
pvalue=res_TOAST$p[, cell_type])
write_delim(pvalues, path = file.path(pval_dir, sprintf("SCZ_TOAST_%s.txt", cell_type)))
}
pvalues = data.frame(gene_id=rowData(rse_filtered)$gene_id,
gene_name=rowData(rse_filtered)$gene_name,
pvalue=res_DESeq2$p[, "pvalue"])
write_delim(pvalues, path = file.path(pval_dir, "SCZ_DESeq2_bulk.txt"))Figure: Top 3 CARseq cell type-specific REACTOME pathways ranked by -log10 q value with the sign of NES in the SCZ study.
topPathways_table = NULL
for (cell_type in colnames(res_CARseq$p)) {
CARseq_filename = file.path(cache_dir, sprintf("scz_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("scz_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
CARseq_fgseaResPositive = CARseq_fgseaRes[CARseq_fgseaRes$NES > 0, ] # genes that are enriched in small p-values
CARseq_topPathways = CARseq_fgseaResPositive[order(CARseq_fgseaResPositive[,"padj"], -CARseq_fgseaResPositive[,"NES"]), ]$pathway
topPathways_table = rbind(topPathways_table,
data.frame(pathway=CARseq_topPathways,
cell_type=cell_type,
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(CARseq_fgseaRes$pval)
[match(CARseq_topPathways, CARseq_fgseaRes$pathway)]),
NES = CARseq_fgseaRes[,"NES"]
[match(CARseq_topPathways, CARseq_fgseaRes$pathway)],
CT_specific_rank=seq_along(CARseq_topPathways)))
}
topPathways_table$cell_type = factor(as.character(topPathways_table$cell_type), levels = colnames(res_CARseq$p))
# deduplicate to remove the same pathway with a less significant q value
topPathways_table = topPathways_table[order(topPathways_table$CT_specific_rank, -topPathways_table$log10qvalue, -topPathways_table$NES), ]
topPathways_table = topPathways_table[!duplicated(topPathways_table$pathway), ]
# remove pathways that are not in the top N of each cell type
N = 3
topN_Pathways_table = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
cell_type_pathway_table = topPathways_table[topPathways_table$cell_type == cell_type, ]
topN_Pathways_table = rbind(topN_Pathways_table, cell_type_pathway_table[1:N, ])
}
# create a matrix for log10 q values of GSEA results of REACTOME
# The first 6 columns are CARseq, and the next 6 columns are TOAST, and the last column is DESeq2
pathway_log10qvalue_matrix = matrix(NA, nrow = nrow(topN_Pathways_table), ncol = ncol(res_CARseq$p) * 2 + 1)
rownames(pathway_log10qvalue_matrix) = as.character(topN_Pathways_table$pathway)
colnames(pathway_log10qvalue_matrix) = c(paste("CARseq", colnames(res_CARseq$p)), paste("TOAST", colnames(res_CARseq$p)), "DESeq2 bulk")
# fill in the blanks for CARseq, TOAST, and DESeq2 log 10 q values
DESeq2_filename = file.path(cache_dir, "scz_DESeq2_GSEAMultilevel_REACTOME.rds")
DESeq2_fgseaRes = readRDS(DESeq2_filename)
indices_CARseq = match(rownames(pathway_log10qvalue_matrix), CARseq_fgseaRes$pathway)
indices_TOAST = match(rownames(pathway_log10qvalue_matrix), TOAST_fgseaRes$pathway)
indices_DESeq2 = match(rownames(pathway_log10qvalue_matrix), DESeq2_fgseaRes$pathway)
pathway_log10qvalue_matrix[, 1 + 2 * ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(DESeq2_fgseaRes$pval)) *
sign(DESeq2_fgseaRes$NES))[indices_DESeq2]
for (j in seq_len(ncol(res_CARseq$p))) {
cell_type = colnames(res_CARseq$p)[j]
CARseq_filename = file.path(cache_dir, sprintf("scz_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("scz_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
pathway_log10qvalue_matrix[, j] =
(-log10(CARseq:::get_qvalues_one_inflated(CARseq_fgseaRes$pval)) *
sign(CARseq_fgseaRes$NES))[indices_CARseq]
pathway_log10qvalue_matrix[, j + ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(TOAST_fgseaRes$pval)) *
sign(TOAST_fgseaRes$NES))[indices_TOAST]
}
write.csv(pathway_log10qvalue_matrix, file=file.path(cache_dir, "scz_gsea_CARseq_vs_TOAST_REACTOME_ranked_by_CARseq_heatmap.csv"), quote=FALSE)
# heatmap
# remove _ -> remove "REACTOME" -> wrap
rownames(pathway_log10qvalue_matrix) = str_wrap(sub("REACTOME", "", gsub("_", " ", rownames(pathway_log10qvalue_matrix))), 75)
ann_colors = list(TopPathwaysByCT = as.character(wes_palette(name="Darjeeling2", 6, type="continuous")),
Method = as.character(wes_palette("FantasticFox1", 3)),
CellType = c(as.character(wes_palette(name="Darjeeling2", 6, type="continuous")), "grey"))
names(ann_colors[["TopPathwaysByCT"]]) = colnames(res_CARseq$p)
names(ann_colors[["Method"]]) = c("CARseq", "TOAST", "DESeq2")
names(ann_colors[["CellType"]]) = c(colnames(res_CARseq$p), "bulk")
annotation_row = data.frame(TopPathwaysByCT=factor(rep(colnames(res_CARseq$p), each=N)))
rownames(annotation_row) = rownames(pathway_log10qvalue_matrix)
annotation_col = data.frame(Method=factor(c(rep("CARseq", ncol(res_CARseq$p)), rep("TOAST", ncol(res_CARseq$p)), "DESeq2"),
levels=c("CARseq", "TOAST", "DESeq2")),
CellType=factor(c(rep(colnames(res_CARseq$p), times=2), "bulk")))
rownames(annotation_col) = colnames(pathway_log10qvalue_matrix)
g_scz_4 = pheatmap(pathway_log10qvalue_matrix,
cluster_rows = F, cluster_cols = F,
border_color ="grey60", na_col = "white", angle_col = "90",
colorRampPalette(c("darkblue", "royalblue", "grey95", "grey95", "pink1", "darkred"))(8),
breaks = seq(-4, 4, by=1), legend_breaks = c(seq(-3, 3, by=1), 4),
legend_labels = c(seq(-3, 3, by=1), "signed\n-log10(q)"),
gaps_row = seq(N, N*ncol(res_CARseq$p), by = N),
gaps_col = c(ncol(res_CARseq$p), ncol(res_CARseq$p)*2),
annotation_row = annotation_row, annotation_col=annotation_col,
annotation_colors = ann_colors, fontsize_row = 8,
cellwidth = 12, cellheight = 10)
g_scz_4Figure: Top 3 TOAST cell type-specific REACTOME pathways ranked by -log10 q value with the sign of NES in the SCZ study.
topPathways_table = NULL
for (cell_type in colnames(res_CARseq$p)) {
CARseq_filename = file.path(cache_dir, sprintf("scz_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("scz_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
TOAST_fgseaResPositive = TOAST_fgseaRes[TOAST_fgseaRes$NES > 0, ] # genes that are enriched in small p-values
# TOAST_topPathways = head(TOAST_fgseaResPositive[order(TOAST_fgseaResPositive[,"padj"], -TOAST_fgseaResPositive[,"NES"]), ]$pathway, n = 100)
TOAST_topPathways = TOAST_fgseaResPositive[order(TOAST_fgseaResPositive[,"padj"], -TOAST_fgseaResPositive[,"NES"]), ]$pathway
topPathways_table = rbind(topPathways_table,
data.frame(pathway=TOAST_topPathways,
cell_type=cell_type,
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(TOAST_fgseaRes$pval)
[match(TOAST_topPathways, TOAST_fgseaRes$pathway)]),
NES = TOAST_fgseaRes[,"NES"]
[match(TOAST_topPathways, TOAST_fgseaRes$pathway)],
CT_specific_rank=seq_along(TOAST_topPathways)))
}
topPathways_table$cell_type = factor(as.character(topPathways_table$cell_type), levels = colnames(res_TOAST$p))
# deduplicate to remove the same pathway with a less significant q value
topPathways_table = topPathways_table[order(topPathways_table$CT_specific_rank, -topPathways_table$log10qvalue, -topPathways_table$NES), ]
topPathways_table = topPathways_table[!duplicated(topPathways_table$pathway), ]
# remove pathways that are not in the top N of each cell type
N = 3
topN_Pathways_table = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
cell_type_pathway_table = topPathways_table[topPathways_table$cell_type == cell_type, ]
topN_Pathways_table = rbind(topN_Pathways_table, cell_type_pathway_table[1:N, ])
}
# create a matrix for log10 q values of GSEA results of REACTOME
# The first 6 columns are CARseq, and the next 6 columns are TOAST, and the last column is DESeq2
pathway_log10qvalue_matrix = matrix(NA, nrow = nrow(topN_Pathways_table), ncol = ncol(res_CARseq$p) * 2 + 1)
rownames(pathway_log10qvalue_matrix) = as.character(topN_Pathways_table$pathway)
colnames(pathway_log10qvalue_matrix) = c(paste("CARseq", colnames(res_CARseq$p)), paste("TOAST", colnames(res_CARseq$p)), "DESeq2 bulk")
# fill in the blanks for CARseq, TOAST, and DESeq2 log 10 q values
DESeq2_filename = file.path(cache_dir, "scz_DESeq2_GSEAMultilevel_REACTOME.rds")
DESeq2_fgseaRes = readRDS(DESeq2_filename)
indices_CARseq = match(rownames(pathway_log10qvalue_matrix), CARseq_fgseaRes$pathway)
indices_TOAST = match(rownames(pathway_log10qvalue_matrix), TOAST_fgseaRes$pathway)
indices_DESeq2 = match(rownames(pathway_log10qvalue_matrix), DESeq2_fgseaRes$pathway)
pathway_log10qvalue_matrix[, 1 + 2 * ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(DESeq2_fgseaRes$pval)) *
sign(DESeq2_fgseaRes$NES))[indices_DESeq2]
for (j in seq_len(ncol(res_CARseq$p))) {
cell_type = colnames(res_CARseq$p)[j]
CARseq_filename = file.path(cache_dir, sprintf("scz_CARseq_GSEAMultilevel_REACTOME_%s.rds", cell_type))
TOAST_filename = file.path(cache_dir, sprintf("scz_TOAST_GSEAMultilevel_REACTOME_%s.rds", cell_type))
CARseq_fgseaRes = readRDS(CARseq_filename)
TOAST_fgseaRes = readRDS(TOAST_filename)
pathway_log10qvalue_matrix[, j] =
(-log10(CARseq:::get_qvalues_one_inflated(CARseq_fgseaRes$pval)) *
sign(CARseq_fgseaRes$NES))[indices_CARseq]
pathway_log10qvalue_matrix[, j + ncol(res_CARseq$p)] =
(-log10(CARseq:::get_qvalues_one_inflated(TOAST_fgseaRes$pval)) *
sign(TOAST_fgseaRes$NES))[indices_TOAST]
}
write.csv(pathway_log10qvalue_matrix, file=file.path(cache_dir, "scz_gsea_CARseq_vs_TOAST_REACTOME_ranked_by_TOAST_heatmap.csv"), quote=FALSE)
# heatmap
# remove _ -> remove "REACTOME" -> wrap
rownames(pathway_log10qvalue_matrix) = str_wrap(sub("REACTOME", "", gsub("_", " ", rownames(pathway_log10qvalue_matrix))), 60)
ann_colors = list(TopPathwaysByCT = as.character(wes_palette(name="Darjeeling2", 6, type="continuous")),
Method = as.character(wes_palette("FantasticFox1", 3)),
CellType = c(as.character(wes_palette(name="Darjeeling2", 6, type="continuous")), "grey"))
names(ann_colors[["TopPathwaysByCT"]]) = colnames(res_CARseq$p)
names(ann_colors[["Method"]]) = c("CARseq", "TOAST", "DESeq2")
names(ann_colors[["CellType"]]) = c(colnames(res_CARseq$p), "bulk")
annotation_row = data.frame(TopPathwaysByCT=factor(rep(colnames(res_CARseq$p), each=N)))
rownames(annotation_row) = rownames(pathway_log10qvalue_matrix)
annotation_col = data.frame(Method=factor(c(rep("CARseq", ncol(res_CARseq$p)), rep("TOAST", ncol(res_CARseq$p)), "DESeq2"),
levels=c("CARseq", "TOAST", "DESeq2")),
CellType=factor(c(rep(colnames(res_CARseq$p), times=2), "bulk")))
rownames(annotation_col) = colnames(pathway_log10qvalue_matrix)
pheatmap(pathway_log10qvalue_matrix,
cluster_rows = F, cluster_cols = F,
border_color ="grey60", na_col = "white", angle_col = "90",
colorRampPalette(c("darkblue", "royalblue", "grey95", "grey95", "pink1", "darkred"))(8),
breaks = seq(-4, 4, by=1), legend_breaks = c(seq(-3, 3, by=1), 4),
legend_labels = c(seq(-3, 3, by=1), "signed\n-log10(q)"),
gaps_row = seq(N, N*ncol(res_CARseq$p), by = N),
gaps_col = c(ncol(res_CARseq$p), ncol(res_CARseq$p)*2),
annotation_row = annotation_row, annotation_col=annotation_col,
annotation_colors = ann_colors, fontsize_row = 8,
cellwidth = 12, cellheight = 15)We use “classic” weight where no weight is added. The genes are ranked by p-values. The gene matching is done in gene symbol. There is a lot of genes with p-values being 1, which “may or may not cause trouble”, so we only care about the genes that are enriched in small p-values:
# Running fgsea:
pval = -log10(res_CARseq$p[, "Astro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.8858 0.4086 0.6988 0.0764 0.297 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_BP_Astro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Exc"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0 0 0.497 0.118 0.176 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_BP_Exc.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Inh"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0 0.06 0.232 0.122 0 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_BP_Inh.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Oligo"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.976 0.516 0 0.375 0 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_BP_Oligo.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "Micro"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0.496 0.429 2.033 0 0.484 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_BP_Micro.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.# Running fgsea:
pval = -log10(res_CARseq$p[, "OPC"])
# Loading stats:
stats = pval
names(stats) = sub("\\.[0-9]*", "", rownames(res_CARseq$p)) # ensembl id
names(stats) = rowData(rse_filtered)$gene_name # gene symbol
str(stats)## Named num [1:20788] 0 0.314 1.854 0.638 0 ...
## - attr(*, "names")= chr [1:20788] "TSPAN6" "DPM1" "SCYL3" "C1orf112" ...
## [1] 20788
stats = stats[unique(names(stats))] # deduplicate by only taking the highest one of possible duplicated gene symbols
length(stats)## [1] 20635
stats = na.omit(stats)
# And running fgsea:
filename = file.path(cache_dir, "scz_CARseq_GSEAMultilevel_BP_OPC.rds")
if (!file.exists(filename)) {
set.seed(1234)
fgseaRes = fgseaMultilevel(pathways_go_bp, stats, minSize=10, maxSize=1000, gseaParam = 0, eps = 0)
saveRDS(fgseaRes, filename)
write.csv(fgseaRes[,1:7], sub("rds", "csv", filename))
} else {
fgseaRes = readRDS(filename)
}
# head(fgseaRes[order(fgseaRes[,"pval"]), ])
fgseaResPositive = fgseaRes[fgseaRes$NES > 0, ] # genes that are enriched in small p-values
topPathways = head(fgseaResPositive[order(fgseaResPositive[,"padj"], -fgseaResPositive[,"NES"]), ]$pathway, n = 10)
plotGseaTable(pathways_go_bp[topPathways], stats, fgseaResPositive,
gseaParam = 0.5) # The 0.5 is only used for illustration purposes.scRNAseq_sig = readRDS(file.path(scz_working_dir, "./MTG/all_genes_MTG.rds"))
scRNAseq_sig$sig_matched = scRNAseq_sig$SIG[match(rowData(rse_filtered)$gene_name, rownames(scRNAseq_sig$SIG)), ]
colnames(scRNAseq_sig$sig_matched) = paste("scRNAseq", colnames(scRNAseq_sig$sig_matched)) # expression from MTG scRNAseq
dim(scRNAseq_sig$sig_matched)## [1] 20788 6
Plot a heatmap of correlation matrix
# Spearman correlation
# With infinite LFCs
res_CARseq_coefficients = res_CARseq$coefficients[, grep("Control|SCZ", colnames(res_CARseq$coefficients))]
cor(res_CARseq_coefficients, method="spearman", use="pairwise.complete.obs")[1:6,7:12]## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## Control:Astro 0.851 0.44 0.033 0.45 0.48 0.30
## Control:Exc 0.429 0.99 0.652 0.21 0.52 0.37
## Control:Inh 0.065 0.63 0.880 -0.12 0.23 0.31
## Control:Micro 0.437 0.20 -0.134 0.55 0.28 0.10
## Control:Oligo 0.533 0.52 0.183 0.40 0.80 0.25
## Control:OPC 0.271 0.40 0.402 -0.03 0.26 0.71
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.62 0.56 0.28 0.217
## scRNAseq Exc 0.30 0.82 0.57 0.091
## scRNAseq Inh 0.35 0.78 0.56 0.120
## scRNAseq Micro 0.39 0.46 0.19 0.372
## scRNAseq Oligo 0.44 0.63 0.31 0.202
## scRNAseq OPC 0.47 0.65 0.39 0.188
## Control:Oligo Control:OPC
## scRNAseq Astro 0.53 0.27
## scRNAseq Exc 0.40 0.35
## scRNAseq Inh 0.44 0.31
## scRNAseq Micro 0.46 0.17
## scRNAseq Oligo 0.72 0.23
## scRNAseq OPC 0.52 0.31
## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## scRNAseq Astro 0.60 0.56 0.27 0.24 0.52 0.27
## scRNAseq Exc 0.30 0.82 0.57 0.12 0.41 0.32
## scRNAseq Inh 0.34 0.78 0.57 0.15 0.45 0.28
## scRNAseq Micro 0.37 0.45 0.20 0.42 0.44 0.17
## scRNAseq Oligo 0.41 0.63 0.32 0.26 0.71 0.22
## scRNAseq OPC 0.44 0.65 0.39 0.22 0.52 0.31
# With shrunken LFCs -- note that the shrunken LFCs are not very good
res_CARseq_shrunken_coefficients = res_CARseq$shrunken_coefficients[, grep("Control|SCZ", colnames(res_CARseq$shrunken_coefficients))]
res_CARseq_shrunken_coefficients[] = c(res_CARseq_shrunken_coefficients) + c(res_CARseq$shrunken_coefficients[, grep("intercept", colnames(res_CARseq$shrunken_coefficients))])
cor(res_CARseq_shrunken_coefficients, method="spearman", use="pairwise.complete.obs")[1:6,7:12]## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## Control:Astro 0.99 0.49 0.111 0.525 0.56 0.37
## Control:Exc 0.48 1.00 0.676 0.336 0.56 0.50
## Control:Inh 0.11 0.67 0.991 -0.053 0.27 0.43
## Control:Micro 0.52 0.34 -0.059 0.985 0.39 0.14
## Control:Oligo 0.57 0.57 0.263 0.435 0.98 0.34
## Control:OPC 0.37 0.50 0.444 0.123 0.33 0.99
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.66 0.56 0.31 0.31
## scRNAseq Exc 0.34 0.82 0.60 0.19
## scRNAseq Inh 0.39 0.78 0.59 0.22
## scRNAseq Micro 0.41 0.46 0.20 0.48
## scRNAseq Oligo 0.48 0.63 0.34 0.32
## scRNAseq OPC 0.50 0.65 0.42 0.29
## Control:Oligo Control:OPC
## scRNAseq Astro 0.56 0.35
## scRNAseq Exc 0.44 0.42
## scRNAseq Inh 0.48 0.38
## scRNAseq Micro 0.48 0.22
## scRNAseq Oligo 0.75 0.29
## scRNAseq OPC 0.56 0.39
## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## scRNAseq Astro 0.65 0.56 0.30 0.32 0.55 0.35
## scRNAseq Exc 0.33 0.82 0.60 0.20 0.44 0.42
## scRNAseq Inh 0.38 0.78 0.60 0.23 0.48 0.38
## scRNAseq Micro 0.40 0.46 0.21 0.48 0.47 0.22
## scRNAseq Oligo 0.46 0.63 0.35 0.33 0.75 0.29
## scRNAseq OPC 0.49 0.65 0.42 0.29 0.55 0.39
# With infinite LFCs as NAs
res_CARseq_coefficients_as_NAs = res_CARseq$coefficients[, grep("Control|SCZ", colnames(res_CARseq$coefficients))]
res_CARseq_coefficients_as_NAs[!is.finite(res_CARseq_coefficients_as_NAs)] = NA
cor(res_CARseq_coefficients_as_NAs, method="spearman", use="pairwise.complete.obs")[1:6,7:12]## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## Control:Astro 0.93 0.78 0.67 0.74 0.77 0.72
## Control:Exc 0.76 0.99 0.87 0.67 0.78 0.81
## Control:Inh 0.67 0.87 0.95 0.65 0.71 0.74
## Control:Micro 0.73 0.65 0.59 0.81 0.67 0.63
## Control:Oligo 0.76 0.79 0.70 0.70 0.92 0.70
## Control:OPC 0.72 0.81 0.74 0.64 0.70 0.89
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.67 0.57 0.42 0.41
## scRNAseq Exc 0.55 0.82 0.68 0.37
## scRNAseq Inh 0.57 0.78 0.67 0.38
## scRNAseq Micro 0.45 0.47 0.34 0.49
## scRNAseq Oligo 0.55 0.63 0.48 0.40
## scRNAseq OPC 0.60 0.65 0.52 0.40
## Control:Oligo Control:OPC
## scRNAseq Astro 0.58 0.51
## scRNAseq Exc 0.57 0.61
## scRNAseq Inh 0.59 0.59
## scRNAseq Micro 0.49 0.40
## scRNAseq Oligo 0.77 0.51
## scRNAseq OPC 0.61 0.54
## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## scRNAseq Astro 0.65 0.57 0.43 0.43 0.57 0.50
## scRNAseq Exc 0.53 0.82 0.68 0.40 0.56 0.60
## scRNAseq Inh 0.55 0.78 0.68 0.41 0.58 0.58
## scRNAseq Micro 0.43 0.47 0.35 0.51 0.46 0.39
## scRNAseq Oligo 0.53 0.63 0.49 0.43 0.75 0.50
## scRNAseq OPC 0.58 0.65 0.53 0.43 0.60 0.54
# With infinite LFCs as NAs & Pearson correlation
cor(res_CARseq_coefficients_as_NAs, method="pearson", use="pairwise.complete.obs")[1:6,7:12]## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## Control:Astro 0.90 0.75 0.64 0.69 0.74 0.67
## Control:Exc 0.73 0.99 0.86 0.64 0.75 0.75
## Control:Inh 0.64 0.85 0.93 0.60 0.67 0.69
## Control:Micro 0.70 0.62 0.56 0.77 0.65 0.58
## Control:Oligo 0.72 0.75 0.67 0.65 0.89 0.65
## Control:OPC 0.67 0.77 0.70 0.59 0.65 0.82
## Control:Astro Control:Exc Control:Inh Control:Micro
## scRNAseq Astro 0.66 0.56 0.42 0.40
## scRNAseq Exc 0.52 0.81 0.67 0.36
## scRNAseq Inh 0.55 0.77 0.66 0.36
## scRNAseq Micro 0.41 0.43 0.31 0.47
## scRNAseq Oligo 0.53 0.61 0.47 0.39
## scRNAseq OPC 0.58 0.64 0.51 0.39
## Control:Oligo Control:OPC
## scRNAseq Astro 0.56 0.49
## scRNAseq Exc 0.55 0.58
## scRNAseq Inh 0.56 0.56
## scRNAseq Micro 0.45 0.37
## scRNAseq Oligo 0.74 0.48
## scRNAseq OPC 0.59 0.52
## SCZ:Astro SCZ:Exc SCZ:Inh SCZ:Micro SCZ:Oligo SCZ:OPC
## scRNAseq Astro 0.64 0.56 0.42 0.41 0.56 0.47
## scRNAseq Exc 0.51 0.81 0.67 0.38 0.54 0.57
## scRNAseq Inh 0.53 0.77 0.67 0.39 0.55 0.56
## scRNAseq Micro 0.41 0.43 0.32 0.49 0.43 0.35
## scRNAseq Oligo 0.51 0.62 0.48 0.41 0.73 0.48
## scRNAseq OPC 0.56 0.64 0.52 0.41 0.58 0.51
# plot
# library(pheatmap)
cor_all = cor(cbind(res_CARseq_coefficients_as_NAs, log(1 + scRNAseq_sig$sig_matched)), method="pearson", use="pairwise.complete.obs")
# cor_all[lower.tri(cor_all, diag=TRUE)] = NA
cor_all = cor_all[1:12, 7:18]
cor_all[7:12, 1:6] = NA
cor_all_as_character = matrix(sprintf("%.2f", cor_all), nrow=nrow(cor_all))
cor_all_as_character[cor_all_as_character == "NA"] = ""
pheatmap(cor_all, cluster_rows = F, cluster_cols = F, display_numbers = cor_all_as_character, border_color = NA, na_col = "white", angle_col = "90", colorRampPalette(c("blue", "royalblue", "white", "pink", "red"))(21), breaks = seq(-1, 1, by=0.1), gaps_row = 6, gaps_col = 6)# shrunken
# not as good since the previously infinite estimates (signifying poor fit of the model) are now
# around 0 after shrinkage, which will be included in the calculation of corr now.
cor_all_shrunken = cor(cbind(res_CARseq_shrunken_coefficients, log(1 + scRNAseq_sig$sig_matched)), method="pearson", use="pairwise.complete.obs")
# cor_all_shrunken[lower.tri(cor_all_shrunken, diag=TRUE)] = NA
cor_all_shrunken = cor_all_shrunken[1:12, 7:18]
cor_all_shrunken[7:12, 1:6] = NA
cor_all_shrunken_as_character = matrix(sprintf("%.2f", cor_all_shrunken), nrow=nrow(cor_all_shrunken))
cor_all_shrunken_as_character[cor_all_shrunken_as_character == "NA"] = ""
pheatmap(cor_all_shrunken, cluster_rows = F, cluster_cols = F, display_numbers = cor_all_shrunken_as_character, border_color = NA, na_col = "white", angle_col = "90", colorRampPalette(c("blue", "royalblue", "white", "pink", "red"))(21), breaks = seq(-1, 1, by=0.1), gaps_row = 6, gaps_col = 6)The samples sequenced in Pitt has a higher read depth than in MSSM and in Penn.
Figure: Cellular fractions in CommonMind Consortium SCZ and control samples deconvoluted using ICeDT. The samples are sorted by increasing cellular fractions in Exc among each group.
# library(wesanderson)
cellular_proportions = readRDS(file.path(scz_working_dir, "data/SCZ_prop.rds"))
cellular_proportions$ICeDT = cellular_proportions$ICeDT[colnames(rse_filtered), ]
cellular_proportions$CIBERSORT = cellular_proportions$CIBERSORT[colnames(rse_filtered), ]
cellular_proportions_longtable =
cellular_proportions$ICeDT[colnames(rse_filtered), ] %>% as.data.frame() %>%
rownames_to_column(var = "samples") %>%
add_column(group = colData(rse_filtered)$Project.ID) %>%
add_column(Exc_prop = cellular_proportions$ICeDT[,"Exc"]) %>%
pivot_longer(cols = Astro:OPC, names_to = "cell type", values_to = "cellular proportions")
# There are three centers: MSSM, Penn, and Pitt.
colData(rse_filtered)$Institution = "MSSM"
colData(rse_filtered)$Institution[colData(rse_filtered)$InstitutionPenn == 1] = "Penn"
colData(rse_filtered)$Institution[colData(rse_filtered)$InstitutionPitt == 1] = "Pitt"
opar = par(cex.axis = 0.7)
boxplot(log_depth~Institution+Project.ID, colData(rse_filtered))par(opar)
g_scz_1_SCZ = ggplot(data=cellular_proportions_longtable %>% subset(group %in% "SCZ"), aes(x=reorder(samples, Exc_prop), y=`cellular proportions`, fill=`cell type`)) +
geom_bar(stat = "identity", width=1) +
ggtitle("SCZ samples") +
xlab("sample") +
scale_fill_manual(values = wes_palette(6, name="Darjeeling2", type="continuous")) +
theme_minimal() +
theme(panel.grid.major = element_blank(), axis.text.x = element_blank(), legend.position = "none")
g_scz_1_Control = ggplot(data=cellular_proportions_longtable %>% subset(group %in% "Control"), aes(x=reorder(samples, Exc_prop), y=`cellular proportions`, fill=`cell type`)) +
geom_bar(stat = "identity", width=1) +
ggtitle("Control samples") +
xlab("sample") +
scale_fill_manual(values = wes_palette(6, name="Darjeeling2", type="continuous")) +
theme_minimal() +
theme(panel.grid.major = element_blank(), axis.text.x = element_blank())
g_scz_1 = ggarrange(g_scz_1_SCZ, g_scz_1_Control, ncol = 2, widths = c(4,5), nrow = 1)
g_scz_1# log ratio against Exc
summary_ICeDT = list()
summary_ICeDT[["Astro"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Astro"] / (1e-2 + cellular_proportions$ICeDT[,"Exc"])))~ Project.ID + log_depth + Institution +
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_ICeDT[["Astro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Astro"]/(0.01 +
## cellular_proportions$ICeDT[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5322 -0.1492 0.0036 0.1585 0.9275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.02924 0.40888 -2.52 0.0121 *
## Project.IDSCZ -0.00342 0.02684 -0.13 0.8986
## log_depth -0.03112 0.06031 -0.52 0.6061
## InstitutionPenn -0.06096 0.04183 -1.46 0.1457
## InstitutionPitt -0.04722 0.04078 -1.16 0.2474
## genderMale -0.03276 0.02683 -1.22 0.2225
## scaled_age_death 0.03355 0.01776 1.89 0.0594 .
## scaled_PMI -0.00492 0.01377 -0.36 0.7208
## scaled_RIN -1.75892 0.20320 -8.66 < 2e-16 ***
## scaled_RIN2 1.72971 0.20694 8.36 6.2e-16 ***
## genoPC1 -0.01371 0.01242 -1.10 0.2700
## genoPC2 -0.00479 0.01346 -0.36 0.7220
## sv1 0.15363 0.02782 5.52 5.4e-08 ***
## sv2 -0.06526 0.01984 -3.29 0.0011 **
## libclustB -0.00609 0.04645 -0.13 0.8958
## libclustbase -0.42807 0.07768 -5.51 5.7e-08 ***
## libclustC -0.16370 0.05323 -3.08 0.0022 **
## libclustD -0.03813 0.04832 -0.79 0.4304
## libclustE -0.18536 0.04943 -3.75 0.0002 ***
## libclustF -0.26715 0.05189 -5.15 3.8e-07 ***
## libclustG -0.17762 0.06730 -2.64 0.0086 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.28 on 506 degrees of freedom
## Multiple R-squared: 0.511, Adjusted R-squared: 0.491
## F-statistic: 26.4 on 20 and 506 DF, p-value: <2e-16
summary_ICeDT[["Inh"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Inh"] / (1e-2 + cellular_proportions$ICeDT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_ICeDT[["Inh"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Inh"]/(0.01 +
## cellular_proportions$ICeDT[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0228 -0.1290 0.0129 0.1442 0.7195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.610153 0.325259 -4.95 1.0e-06 ***
## Project.IDSCZ 0.093373 0.021349 4.37 1.5e-05 ***
## log_depth 0.000978 0.047975 0.02 0.98375
## InstitutionPenn -0.333423 0.033276 -10.02 < 2e-16 ***
## InstitutionPitt 0.052342 0.032437 1.61 0.10722
## genderMale 0.046011 0.021340 2.16 0.03155 *
## scaled_age_death -0.041084 0.014126 -2.91 0.00379 **
## scaled_PMI 0.019073 0.010955 1.74 0.08227 .
## scaled_RIN 0.287125 0.161644 1.78 0.07629 .
## scaled_RIN2 -0.304332 0.164615 -1.85 0.06508 .
## genoPC1 0.010455 0.009876 1.06 0.29031
## genoPC2 0.003227 0.010704 0.30 0.76319
## sv1 -0.108941 0.022133 -4.92 1.2e-06 ***
## sv2 0.109951 0.015781 6.97 1.0e-11 ***
## libclustB -0.096146 0.036947 -2.60 0.00953 **
## libclustbase -0.216526 0.061793 -3.50 0.00050 ***
## libclustC -0.156605 0.042341 -3.70 0.00024 ***
## libclustD -0.104104 0.038436 -2.71 0.00699 **
## libclustE -0.135357 0.039323 -3.44 0.00062 ***
## libclustF -0.182957 0.041278 -4.43 1.1e-05 ***
## libclustG -0.084518 0.053537 -1.58 0.11503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.22 on 506 degrees of freedom
## Multiple R-squared: 0.489, Adjusted R-squared: 0.468
## F-statistic: 24.2 on 20 and 506 DF, p-value: <2e-16
summary_ICeDT[["Micro"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Micro"] / (1e-2 + cellular_proportions$ICeDT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_ICeDT[["Micro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Micro"]/(0.01 +
## cellular_proportions$ICeDT[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9180 -0.2581 -0.0276 0.2234 1.4522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.99428 0.59834 -5.00 7.7e-07 ***
## Project.IDSCZ 0.00160 0.03927 0.04 0.96757
## log_depth 0.00457 0.08825 0.05 0.95870
## InstitutionPenn 0.00103 0.06121 0.02 0.98664
## InstitutionPitt -0.20679 0.05967 -3.47 0.00057 ***
## genderMale -0.01328 0.03926 -0.34 0.73523
## scaled_age_death 0.09730 0.02599 3.74 0.00020 ***
## scaled_PMI -0.08512 0.02015 -4.22 2.8e-05 ***
## scaled_RIN -0.34636 0.29736 -1.16 0.24465
## scaled_RIN2 0.38682 0.30282 1.28 0.20205
## genoPC1 -0.01662 0.01817 -0.91 0.36074
## genoPC2 0.03537 0.01969 1.80 0.07305 .
## sv1 0.04290 0.04072 1.05 0.29255
## sv2 -0.09961 0.02903 -3.43 0.00065 ***
## libclustB -0.04228 0.06797 -0.62 0.53414
## libclustbase -0.36647 0.11367 -3.22 0.00135 **
## libclustC -0.28105 0.07789 -3.61 0.00034 ***
## libclustD -0.04112 0.07071 -0.58 0.56112
## libclustE -0.16132 0.07234 -2.23 0.02618 *
## libclustF -0.27300 0.07593 -3.60 0.00036 ***
## libclustG -0.23322 0.09849 -2.37 0.01826 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4 on 506 degrees of freedom
## Multiple R-squared: 0.259, Adjusted R-squared: 0.23
## F-statistic: 8.85 on 20 and 506 DF, p-value: <2e-16
summary_ICeDT[["Oligo"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"Oligo"] / (1e-2 + cellular_proportions$ICeDT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_ICeDT[["Oligo"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "Oligo"]/(0.01 +
## cellular_proportions$ICeDT[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4068 -0.2629 -0.0021 0.2509 1.4306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.22570 0.63694 -3.49 0.00052 ***
## Project.IDSCZ -0.04135 0.04181 -0.99 0.32308
## log_depth 0.07504 0.09395 0.80 0.42484
## InstitutionPenn -0.14918 0.06516 -2.29 0.02247 *
## InstitutionPitt -0.46947 0.06352 -7.39 6.1e-13 ***
## genderMale 0.04283 0.04179 1.02 0.30592
## scaled_age_death 0.13074 0.02766 4.73 3.0e-06 ***
## scaled_PMI -0.08446 0.02145 -3.94 9.4e-05 ***
## scaled_RIN -1.50719 0.31654 -4.76 2.5e-06 ***
## scaled_RIN2 1.53692 0.32236 4.77 2.4e-06 ***
## genoPC1 -0.01794 0.01934 -0.93 0.35395
## genoPC2 0.00180 0.02096 0.09 0.93177
## sv1 0.11762 0.04334 2.71 0.00688 **
## sv2 0.01043 0.03090 0.34 0.73599
## libclustB 0.00681 0.07235 0.09 0.92500
## libclustbase -0.37437 0.12101 -3.09 0.00209 **
## libclustC -0.11489 0.08291 -1.39 0.16646
## libclustD -0.08911 0.07527 -1.18 0.23700
## libclustE -0.24939 0.07700 -3.24 0.00128 **
## libclustF -0.16345 0.08083 -2.02 0.04370 *
## libclustG -0.23064 0.10484 -2.20 0.02826 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.43 on 506 degrees of freedom
## Multiple R-squared: 0.471, Adjusted R-squared: 0.45
## F-statistic: 22.5 on 20 and 506 DF, p-value: <2e-16
summary_ICeDT[["OPC"]]=summary(lm(log((1e-2 + cellular_proportions$ICeDT[,"OPC"] / (1e-2 + cellular_proportions$ICeDT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_ICeDT[["OPC"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$ICeDT[, "OPC"]/(0.01 +
## cellular_proportions$ICeDT[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9205 -0.1907 -0.0156 0.1826 0.9369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.46011 0.43560 -5.65 2.7e-08 ***
## Project.IDSCZ 0.00810 0.02859 0.28 0.77702
## log_depth 0.02981 0.06425 0.46 0.64285
## InstitutionPenn -0.31722 0.04456 -7.12 3.8e-12 ***
## InstitutionPitt 0.01219 0.04344 0.28 0.77913
## genderMale -0.00309 0.02858 -0.11 0.91391
## scaled_age_death -0.07577 0.01892 -4.00 7.1e-05 ***
## scaled_PMI -0.01595 0.01467 -1.09 0.27735
## scaled_RIN -1.78290 0.21648 -8.24 1.5e-15 ***
## scaled_RIN2 1.71382 0.22046 7.77 4.3e-14 ***
## genoPC1 -0.00981 0.01323 -0.74 0.45841
## genoPC2 0.00192 0.01433 0.13 0.89335
## sv1 0.27370 0.02964 9.23 < 2e-16 ***
## sv2 -0.01510 0.02113 -0.71 0.47521
## libclustB -0.02321 0.04948 -0.47 0.63919
## libclustbase -0.53436 0.08276 -6.46 2.5e-10 ***
## libclustC -0.20762 0.05670 -3.66 0.00028 ***
## libclustD -0.08991 0.05147 -1.75 0.08131 .
## libclustE -0.34694 0.05266 -6.59 1.1e-10 ***
## libclustF -0.37799 0.05528 -6.84 2.3e-11 ***
## libclustG -0.16641 0.07170 -2.32 0.02068 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.29 on 506 degrees of freedom
## Multiple R-squared: 0.626, Adjusted R-squared: 0.611
## F-statistic: 42.4 on 20 and 506 DF, p-value: <2e-16
# log ratio against Exc
summary_CIBERSORT = list()
summary_CIBERSORT[["Astro"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Astro"] / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"])))~ Project.ID + log_depth + Institution +
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_CIBERSORT[["Astro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Astro"]/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"]))) ~ Project.ID +
## log_depth + Institution + genderMale + scaled_age_death +
## scaled_PMI + scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 +
## sv1 + sv2 + libclustB + libclustbase + libclustC + libclustD +
## libclustE + libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3725 -0.2142 0.0067 0.1982 1.3322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.43646 0.59897 -2.40 0.0168 *
## Project.IDSCZ -0.04931 0.03932 -1.25 0.2103
## log_depth -0.07343 0.08835 -0.83 0.4063
## InstitutionPenn 0.11738 0.06128 1.92 0.0560 .
## InstitutionPitt -0.15979 0.05973 -2.68 0.0077 **
## genderMale -0.06265 0.03930 -1.59 0.1115
## scaled_age_death 0.11590 0.02601 4.46 1.0e-05 ***
## scaled_PMI -0.03569 0.02017 -1.77 0.0775 .
## scaled_RIN -2.35496 0.29767 -7.91 1.6e-14 ***
## scaled_RIN2 2.33238 0.30314 7.69 7.5e-14 ***
## genoPC1 -0.01722 0.01819 -0.95 0.3441
## genoPC2 -0.00768 0.01971 -0.39 0.6968
## sv1 0.13226 0.04076 3.24 0.0013 **
## sv2 -0.18869 0.02906 -6.49 2.0e-10 ***
## libclustB 0.03553 0.06804 0.52 0.6017
## libclustbase -0.25918 0.11379 -2.28 0.0232 *
## libclustC -0.03068 0.07797 -0.39 0.6942
## libclustD 0.02950 0.07078 0.42 0.6770
## libclustE -0.07308 0.07241 -1.01 0.3133
## libclustF -0.18486 0.07601 -2.43 0.0154 *
## libclustG -0.16387 0.09859 -1.66 0.0971 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4 on 506 degrees of freedom
## Multiple R-squared: 0.461, Adjusted R-squared: 0.439
## F-statistic: 21.6 on 20 and 506 DF, p-value: <2e-16
summary_CIBERSORT[["Inh"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Inh"] / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_CIBERSORT[["Inh"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Inh"]/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"]))) ~ Project.ID +
## log_depth + Institution + genderMale + scaled_age_death +
## scaled_PMI + scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 +
## sv1 + sv2 + libclustB + libclustbase + libclustC + libclustD +
## libclustE + libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9729 -0.2405 -0.0169 0.2365 1.5883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.1371 0.5382 -5.83 1.0e-08 ***
## Project.IDSCZ 0.0549 0.0353 1.55 0.121
## log_depth -0.0681 0.0794 -0.86 0.391
## InstitutionPenn -0.1009 0.0551 -1.83 0.068 .
## InstitutionPitt -0.0546 0.0537 -1.02 0.309
## genderMale 0.0305 0.0353 0.86 0.388
## scaled_age_death -0.0321 0.0234 -1.37 0.171
## scaled_PMI 0.0275 0.0181 1.51 0.131
## scaled_RIN 1.1572 0.2675 4.33 1.8e-05 ***
## scaled_RIN2 -1.1344 0.2724 -4.16 3.7e-05 ***
## genoPC1 0.0310 0.0163 1.89 0.059 .
## genoPC2 0.0162 0.0177 0.92 0.360
## sv1 -0.2647 0.0366 -7.23 1.8e-12 ***
## sv2 -0.0101 0.0261 -0.39 0.700
## libclustB -0.0687 0.0611 -1.12 0.261
## libclustbase 0.2578 0.1023 2.52 0.012 *
## libclustC -0.0438 0.0701 -0.62 0.532
## libclustD -0.1011 0.0636 -1.59 0.113
## libclustE 0.1641 0.0651 2.52 0.012 *
## libclustF 0.1114 0.0683 1.63 0.104
## libclustG -0.0868 0.0886 -0.98 0.327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.36 on 506 degrees of freedom
## Multiple R-squared: 0.44, Adjusted R-squared: 0.418
## F-statistic: 19.9 on 20 and 506 DF, p-value: <2e-16
summary_CIBERSORT[["Micro"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Micro"] / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_CIBERSORT[["Micro"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Micro"]/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"]))) ~ Project.ID +
## log_depth + Institution + genderMale + scaled_age_death +
## scaled_PMI + scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 +
## sv1 + sv2 + libclustB + libclustbase + libclustC + libclustD +
## libclustE + libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9907 -0.2914 -0.0474 0.2640 1.7574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.226452 0.657919 -6.42 3.1e-10 ***
## Project.IDSCZ -0.014116 0.043185 -0.33 0.7439
## log_depth 0.015723 0.097042 0.16 0.8714
## InstitutionPenn 0.218228 0.067309 3.24 0.0013 **
## InstitutionPitt -0.302782 0.065611 -4.61 5.0e-06 ***
## genderMale 0.008336 0.043166 0.19 0.8470
## scaled_age_death 0.141637 0.028573 4.96 9.8e-07 ***
## scaled_PMI -0.091422 0.022159 -4.13 4.3e-05 ***
## scaled_RIN 0.157403 0.326966 0.48 0.6304
## scaled_RIN2 -0.072364 0.332975 -0.22 0.8280
## genoPC1 -0.032888 0.019977 -1.65 0.1003
## genoPC2 0.046244 0.021651 2.14 0.0332 *
## sv1 -0.049280 0.044770 -1.10 0.2715
## sv2 -0.072090 0.031920 -2.26 0.0243 *
## libclustB 0.061674 0.074734 0.83 0.4096
## libclustbase 0.000541 0.124993 0.00 0.9965
## libclustC -0.137123 0.085645 -1.60 0.1100
## libclustD 0.167953 0.077746 2.16 0.0312 *
## libclustE 0.102931 0.079540 1.29 0.1962
## libclustF 0.020950 0.083495 0.25 0.8020
## libclustG 0.043929 0.108292 0.41 0.6852
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.44 on 506 degrees of freedom
## Multiple R-squared: 0.281, Adjusted R-squared: 0.252
## F-statistic: 9.88 on 20 and 506 DF, p-value: <2e-16
summary_CIBERSORT[["Oligo"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"Oligo"] / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_CIBERSORT[["Oligo"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "Oligo"]/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"]))) ~ Project.ID +
## log_depth + Institution + genderMale + scaled_age_death +
## scaled_PMI + scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 +
## sv1 + sv2 + libclustB + libclustbase + libclustC + libclustD +
## libclustE + libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8887 -0.2976 0.0095 0.2682 1.7694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.573253 0.738069 -3.49 0.00053 ***
## Project.IDSCZ -0.076279 0.048445 -1.57 0.11599
## log_depth 0.063216 0.108864 0.58 0.56171
## InstitutionPenn 0.061371 0.075509 0.81 0.41673
## InstitutionPitt -0.542013 0.073604 -7.36 7.3e-13 ***
## genderMale 0.013386 0.048425 0.28 0.78233
## scaled_age_death 0.172667 0.032054 5.39 1.1e-07 ***
## scaled_PMI -0.116374 0.024858 -4.68 3.7e-06 ***
## scaled_RIN -1.508260 0.366798 -4.11 4.6e-05 ***
## scaled_RIN2 1.554220 0.373539 4.16 3.7e-05 ***
## genoPC1 -0.042684 0.022411 -1.90 0.05740 .
## genoPC2 0.000562 0.024289 0.02 0.98156
## sv1 0.070403 0.050224 1.40 0.16160
## sv2 -0.093178 0.035809 -2.60 0.00954 **
## libclustB 0.074034 0.083839 0.88 0.37763
## libclustbase 0.026302 0.140220 0.19 0.85129
## libclustC 0.120903 0.096078 1.26 0.20883
## libclustD 0.098735 0.087217 1.13 0.25814
## libclustE -0.007422 0.089230 -0.08 0.93374
## libclustF 0.093449 0.093667 1.00 0.31891
## libclustG -0.114427 0.121485 -0.94 0.34669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5 on 506 degrees of freedom
## Multiple R-squared: 0.424, Adjusted R-squared: 0.401
## F-statistic: 18.6 on 20 and 506 DF, p-value: <2e-16
summary_CIBERSORT[["OPC"]]=summary(lm(log((1e-2 + cellular_proportions$CIBERSORT[,"OPC"] / (1e-2 + cellular_proportions$CIBERSORT[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_CIBERSORT[["OPC"]]##
## Call:
## lm(formula = log((0.01 + cellular_proportions$CIBERSORT[, "OPC"]/(0.01 +
## cellular_proportions$CIBERSORT[, "Exc"]))) ~ Project.ID +
## log_depth + Institution + genderMale + scaled_age_death +
## scaled_PMI + scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 +
## sv1 + sv2 + libclustB + libclustbase + libclustC + libclustD +
## libclustE + libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5071 -0.1335 -0.0552 0.0277 2.1434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.91273 0.48439 -10.14 < 2e-16 ***
## Project.IDSCZ -0.02433 0.03179 -0.77 0.444
## log_depth 0.05963 0.07145 0.83 0.404
## InstitutionPenn 0.06641 0.04956 1.34 0.181
## InstitutionPitt -0.05867 0.04831 -1.21 0.225
## genderMale 0.03535 0.03178 1.11 0.267
## scaled_age_death 0.00943 0.02104 0.45 0.654
## scaled_PMI -0.03100 0.01631 -1.90 0.058 .
## scaled_RIN -1.09436 0.24073 -4.55 6.8e-06 ***
## scaled_RIN2 1.10884 0.24515 4.52 7.6e-06 ***
## genoPC1 -0.01514 0.01471 -1.03 0.304
## genoPC2 0.01035 0.01594 0.65 0.516
## sv1 0.08216 0.03296 2.49 0.013 *
## sv2 -0.06056 0.02350 -2.58 0.010 *
## libclustB 0.06877 0.05502 1.25 0.212
## libclustbase -0.15053 0.09203 -1.64 0.103
## libclustC -0.04427 0.06306 -0.70 0.483
## libclustD 0.02630 0.05724 0.46 0.646
## libclustE 0.02330 0.05856 0.40 0.691
## libclustF -0.05116 0.06147 -0.83 0.406
## libclustG -0.03265 0.07973 -0.41 0.682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.33 on 506 degrees of freedom
## Multiple R-squared: 0.143, Adjusted R-squared: 0.11
## F-statistic: 4.23 on 20 and 506 DF, p-value: 4.45e-09
# collect summary of cellular fraction linear models
summary_table = NULL
for (cell_type in names(summary_ICeDT)) {
summary_table = rbind(summary_table,
data.frame(method="ICeDT",
cell_type=cell_type,
diagnosis_effect=summary_ICeDT[[cell_type]]$coefficients[2, "Estimate"],
se=summary_ICeDT[[cell_type]]$coefficients[2, "Std. Error"],
pval=summary_ICeDT[[cell_type]]$coefficients[2, "Pr(>|t|)"]))
summary_table = rbind(summary_table,
data.frame(method="CIBERSORT",
cell_type=cell_type,
diagnosis_effect=summary_CIBERSORT[[cell_type]]$coefficients[2, "Estimate"],
se=summary_CIBERSORT[[cell_type]]$coefficients[2, "Std. Error"],
pval=summary_CIBERSORT[[cell_type]]$coefficients[2, "Pr(>|t|)"]))
}
summary_table$method = factor(summary_table$method, levels=c("ICeDT", "CIBERSORT"))
summary_table$pval_formatted = sprintf("%.3f", summary_table$pval)
summary_table## method cell_type diagnosis_effect se pval pval_formatted
## 1 ICeDT Astro -0.0034 0.027 9.0e-01 0.899
## 2 CIBERSORT Astro -0.0493 0.039 2.1e-01 0.210
## 3 ICeDT Inh 0.0934 0.021 1.5e-05 0.000
## 4 CIBERSORT Inh 0.0549 0.035 1.2e-01 0.121
## 5 ICeDT Micro 0.0016 0.039 9.7e-01 0.968
## 6 CIBERSORT Micro -0.0141 0.043 7.4e-01 0.744
## 7 ICeDT Oligo -0.0414 0.042 3.2e-01 0.323
## 8 CIBERSORT Oligo -0.0763 0.048 1.2e-01 0.116
## 9 ICeDT OPC 0.0081 0.029 7.8e-01 0.777
## 10 CIBERSORT OPC -0.0243 0.032 4.4e-01 0.444
# The errorbars overlapped, so use position_dodge to move them horizontally
pd = position_dodge(0.2) # move them x.x to the left and right
summary_table$cell_type = factor(summary_table$cell_type, levels = rev(levels(summary_table$cell_type)))
g_scz_2 = ggplot(summary_table, aes(x=cell_type, y=diagnosis_effect, colour=method, group=method)) +
geom_errorbar(aes(ymin=diagnosis_effect-se, ymax=diagnosis_effect+se), colour="black", width=.1, position=pd) +
# geom_line(position=pd) +
geom_point(position=pd, size=3, shape=21, fill="white") + # 21 is filled circle
xlab("Cell types") +
ylab("Difference of log ratio between case and control") +
scale_colour_hue(name="Deconvolution method", # Legend label, use darker colors
breaks=c("ICeDT", "CIBERSORT"),
labels=c("ICeDT", "CIBERSORT"),
l=40) + # Use darker colors, lightness=40
ggtitle(" Cellular fraction estimates\n on SCZ vs. Control\n adjusted for covariates") +
expand_limits(y=0) + # Expand y range
theme_bw() +
geom_hline(yintercept=0, linetype="dashed")
g_scz_2Figure: Volcano plot of cell-type specific differential expression between schizophrenia vs. control bulk RNA-seq samples inferred by CARseq. Each point represents a gene whose differential expression is significant (q value < 0.1) among a cell type. There are no genes making the cut in astrocytes or oligodendrocyte progenitor cells (OPCs). Genes with a q value < 0.01 are labeled. Exc, excitatory neurons; Inh, inhibitory neurons; Micro, microglia; Oligo, oligodendrocytes.
volcano_plot_data = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(res_CARseq$p[, cell_type]))
log10qvalue_significant = which(log10qvalue > -log10(0.1))
if (length(log10qvalue_significant) > 0) {
volcano_plot_data = rbind(volcano_plot_data,
data.frame(cell_type = cell_type,
gene_name = rowData(rse_filtered)$gene_name[log10qvalue_significant],
log10qvalue = log10qvalue[log10qvalue_significant],
LFC = res_CARseq$lfc[log10qvalue_significant, grep(cell_type, colnames(res_CARseq$lfc))]
))
}
}
dim(volcano_plot_data)## [1] 795 4
# remove the LFCs that are not finite
volcano_plot_data = volcano_plot_data[is.finite(volcano_plot_data$LFC), ]
dim(volcano_plot_data)## [1] 567 4
# show gene names on the plot when q value < 0.01
volcano_plot_data$gene_name_subset = as.character(volcano_plot_data$gene_name)
volcano_plot_data$gene_name_subset[rank(-volcano_plot_data$log10qvalue) > 10] = ""
ggplot(volcano_plot_data, aes(x=LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0)) +
geom_hline(yintercept=1, linetype="dashed") +
xlim(-max(abs(volcano_plot_data$LFC)), max(abs(volcano_plot_data$LFC))) +
geom_vline(xintercept=0) +
theme_minimal()volcano_plot_data = data.frame()
for (cell_type in colnames(res_CARseq$p)) {
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(res_CARseq$p[, cell_type]))
log10qvalue_significant = which(log10qvalue > -log10(0.1) & abs(res_CARseq$shrunken_lfc[, grep(cell_type, colnames(res_CARseq$shrunken_lfc))]) > 0.01)
if (length(log10qvalue_significant) > 0) {
volcano_plot_data = rbind(volcano_plot_data,
data.frame(cell_type = cell_type,
gene_name = rowData(rse_filtered)$gene_name[log10qvalue_significant],
log10qvalue = log10qvalue[log10qvalue_significant],
shrunken_LFC = res_CARseq$shrunken_lfc[log10qvalue_significant, grep(cell_type, colnames(res_CARseq$shrunken_lfc))]
))
}
}
dim(volcano_plot_data)## [1] 794 4
# remove the LFCs that are not finite
volcano_plot_data = volcano_plot_data[is.finite(volcano_plot_data$shrunken_LFC), ]
dim(volcano_plot_data)## [1] 794 4
volcano_plot_data$gene_name_subset = as.character(volcano_plot_data$gene_name)
# all cell types
volcano_plot = volcano_plot_data
if (nrow(volcano_plot) >= 10) {
volcano_plot$gene_name_subset[rank(-volcano_plot$log10qvalue) > 10] = ""
g_scz_3 = ggplot(volcano_plot, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0)) +
geom_hline(yintercept=1, linetype="dashed") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) + ylim(1, max(volcano_plot_data$log10qvalue)) +
geom_vline(xintercept=0) +
scale_color_manual(values = wes_palette(6, name="Darjeeling2", type="continuous"), limits = colnames(res_CARseq$p)) +
theme_minimal()
g_scz_3
}# neurons
volcano_plot_data_neuronal = subset(volcano_plot_data, (cell_type %in% c("Exc","Inh")))
if (nrow(volcano_plot_data_neuronal) >= 10) {
volcano_plot_data_neuronal$gene_name_subset[rank(-volcano_plot_data_neuronal$log10qvalue) > 10] = ""
ggplot(volcano_plot_data_neuronal, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0)) +
geom_hline(yintercept=1, linetype="dashed") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) + ylim(1, max(volcano_plot_data$log10qvalue)) +
geom_vline(xintercept=0) +
scale_color_manual(values = wes_palette(6, name="Darjeeling2", type="continuous"), limits = colnames(res_CARseq$p)) +
theme_minimal()
}
# non neurons
volcano_plot_data_non_neuronal = subset(volcano_plot_data, !(cell_type %in% c("Exc","Inh")))
if (nrow(volcano_plot_data_non_neuronal) >= 10) {
volcano_plot_data_non_neuronal$gene_name_subset[rank(-volcano_plot_data_non_neuronal$log10qvalue) > 10] = ""
ggplot(volcano_plot_data_non_neuronal, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0,height=0.0)) +
geom_hline(yintercept=1, linetype="dashed") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) + ylim(1, max(volcano_plot_data$log10qvalue)) +
geom_vline(xintercept=0) +
scale_color_manual(values = wes_palette(6, name="Darjeeling2", type="continuous"), limits = colnames(res_CARseq$p)) +
theme_minimal()
}volcano_plot_data = data.frame()
log10qvalue = -log10(CARseq:::get_qvalues_one_inflated(res_DESeq2$p[, "pvalue"]))
res_DESeq2$p$shrunken_LFC = res_DESeq2$p$log2FoldChange * log(2) # transform log2 to natural log
# only showing gene-cell type pairs with q value < 0.1 & shrunken lfc > 0.1 in abs value
log10qvalue_significant = which(log10qvalue > -log10(0.1)) # & abs(res_DESeq2$p$shrunken_LFC) > 0.1)
if (length(log10qvalue_significant) > 0) {
volcano_plot_data = rbind(volcano_plot_data,
data.frame(cell_type = "DESeq2 bulk",
gene_name = rowData(rse_filtered)$gene_name[log10qvalue_significant],
log10qvalue = log10qvalue[log10qvalue_significant],
shrunken_LFC = res_DESeq2$p$shrunken_LFC[log10qvalue_significant]
))
}
dim(volcano_plot_data)## [1] 1009 4
# remove the LFCs that are not finite
volcano_plot_data = volcano_plot_data[is.finite(volcano_plot_data$shrunken_LFC), ]
dim(volcano_plot_data)## [1] 1009 4
# show gene names on the plot when q value < 0.01
volcano_plot_data$gene_name_subset = as.character(volcano_plot_data$gene_name)
volcano_plot_data$gene_name_subset[rank(-volcano_plot_data$log10qvalue) > 10] = ""
ggplot(volcano_plot_data, aes(x=shrunken_LFC, y=log10qvalue, label=gene_name_subset, color=cell_type)) +
# geom_point(alpha = volcano_plot_data$log10qvalue * 0.1) +
geom_point(alpha = 0.5) +
geom_text_repel(hjust="outward", vjust="outward", position=position_jitter(width=0, height=0)) +
geom_hline(yintercept=1, linetype="dashed") +
xlim(-max(abs(volcano_plot_data$shrunken_LFC)), max(abs(volcano_plot_data$shrunken_LFC))) + ylim(1, max(volcano_plot_data$log10qvalue)) +
geom_vline(xintercept=0) +
theme_minimal()library(gplots)
scz_pvalues = data.frame(CARseq=res_CARseq$p, DESeq2.bulk=res_DESeq2$p$pvalue)
scz_qvalues = apply(scz_pvalues, 2, CARseq:::get_qvalues_one_inflated)
scz_significant = (scz_qvalues < 0.1)
scz_significant[is.na(scz_significant)] = FALSE
opar = par(cex = 0.7)
venn(as.data.frame(scz_significant[,
c("CARseq.Micro", "CARseq.Oligo", "DESeq2.bulk")]))# if (!file.exists("figures/Figure_scz_unedited.pdf")) {
pdf("figures/Figure_scz_unedited.pdf", height=9, width=10)
g_scz_1 = ggarrange(g_scz_1_SCZ,
g_scz_1_Control + ylab(""),
ncol = 2, widths = c(3.9, 6.1), nrow = 1)
g_scz_123 = ggarrange(g_scz_1,
g_scz_2 + coord_flip() +
theme(legend.position="bottom",
),
g_scz_3 +
theme(legend.position = "none",
plot.margin = margin(t = .0, r = .2, b = .2, l = .6, unit = "in")),
nrow = 1, ncol = 3, widths = c(4, 2.5, 4.5))
ggarrange(g_scz_123,
g_scz_4[[4]],
nrow = 2, ncol = 1)
dev.off()## png
## 2
# if (!file.exists("figures/Figure_asd_unedited.pdf")) {
pdf("figures/Figure_asd_unedited.pdf", height=9, width=10)
g_asd_1 = ggarrange(g_asd_1_ASD,
g_asd_1_Control + ylab(""),
ncol = 2, widths = c(3.9, 6.1), nrow = 1)
g_asd_123 = ggarrange(g_asd_1,
g_asd_2 + coord_flip() +
theme(legend.position="bottom",
),
g_asd_3 +
theme(legend.position = "none",
plot.margin = margin(t = .0, r = .2, b = .2, l = .6, unit = "in")),
nrow = 1, ncol = 3, widths = c(4, 2.5, 4.5))
ggarrange(g_asd_123,
g_asd_4[[4]],
nrow = 2, ncol = 1)
dev.off()## png
## 2
library(ggpointdensity)
# ASD
library(readxl)
table_from_Gandal = suppressWarnings(read_excel("data/aat8127_Table_S1.xlsx", sheet = "DGE"))
res_DESeq2_unshrunken = list()
res_DESeq2_unshrunken$p = read.table(file.path(autism_working_dir, "results/ASD_step1_DESeq2_bulk_adj_covariates_seqSV4.txt"))
# ENSG00000000003.10 |-> ENSG00000000003
rownames(res_DESeq2_unshrunken$p) = sub("\\..*", "", rownames(res_DESeq2_unshrunken$p))
gene_ids = intersect(rownames(res_DESeq2_unshrunken$p), table_from_Gandal$ensembl_gene_id)
ASD_DESeq2_log2FC_from_Gandal = table_from_Gandal$ASD.log2FC[match(gene_ids, table_from_Gandal$ensembl_gene_id)]
ASD_DESeq2_log2FC_reproduced = res_DESeq2_unshrunken$p[gene_ids, "log2FoldChange"]
# plot(ASD_DESeq2_log2FC_from_Gandal,
# ASD_DESeq2_log2FC_reproduced,
# cex = 0.5, col=rgb(0,0,0,alpha=0.5), pch=19, xlim=c(-1.5, 3.5), ylim=c(-1.5, 3.5))
# abline(a = 0, b = 1, col="red")
# SCZ
library(readxl)
table_from_Gandal = suppressWarnings(read_excel("data/aat8127_Table_S1.xlsx", sheet = "DGE"))
res_DESeq2_unshrunken = list()
res_DESeq2_unshrunken$p = read.table(file.path(autism_working_dir, "results/SCZ_step1_DESeq2_bulk_adj_covariates_SV2.txt"))
# ENSG00000000003.10 |-> ENSG00000000003
rownames(res_DESeq2_unshrunken$p) = sub("\\..*", "", rownames(res_DESeq2_unshrunken$p))
gene_ids = intersect(rownames(res_DESeq2_unshrunken$p), table_from_Gandal$ensembl_gene_id)
SCZ_DESeq2_log2FC_from_Gandal = table_from_Gandal$SCZ.log2FC[match(gene_ids, table_from_Gandal$ensembl_gene_id)]
SCZ_DESeq2_log2FC_reproduced = res_DESeq2_unshrunken$p[gene_ids, "log2FoldChange"]
# plot(SCZ_DESeq2_log2FC_from_Gandal,
# SCZ_DESeq2_log2FC_reproduced,
# cex = 0.5, col=rgb(0,0,0,alpha=0.5), pch=19, xlim=c(-1, 1), ylim=c(-1, 1))
# abline(a = 0, b = 1, col="red")
SCZ_DESeq2_log2FC = data.frame(DESeq2 = SCZ_DESeq2_log2FC_reproduced, Gandal_et_al = SCZ_DESeq2_log2FC_from_Gandal)
ASD_DESeq2_log2FC = data.frame(DESeq2 = ASD_DESeq2_log2FC_reproduced, Gandal_et_al = ASD_DESeq2_log2FC_from_Gandal)
gs1 = ggplot(SCZ_DESeq2_log2FC,aes(x=DESeq2,y=Gandal_et_al)) +
geom_abline(intercept = 0, slope =1) +
geom_pointdensity(size = 0.6) + scale_color_viridis_c() +
ggtitle("SCZ")
gs2 = ggplot(ASD_DESeq2_log2FC,aes(x=DESeq2,y=Gandal_et_al)) +
geom_abline(intercept = 0, slope =1) +
geom_pointdensity(size = 0.6) + scale_color_viridis_c() +
ggtitle("ASD")
ggarrange(gs1, gs2, ncol = 2, nrow = 1, common.legend = TRUE)## Warning: Removed 1 rows containing non-finite values (stat_pointdensity).
Note that there is a considerable proportion of Adult-OtherNeuron in Wang et al.
library(readxl)
table_from_Wang = read_excel("data/DER-24_Cell_fractions_Normalized.xlsx")
table_from_Wang$CellType## [1] "Adult-Ex1" "Adult-Ex2" "Adult-Ex3"
## [4] "Adult-Ex4" "Adult-Ex5" "Adult-Ex6"
## [7] "Adult-Ex7" "Adult-Ex8" "Adult-In1"
## [10] "Adult-In2" "Adult-In3" "Adult-In4"
## [13] "Adult-In5" "Adult-In6" "Adult-In7"
## [16] "Adult-In8" "Adult-Astrocytes" "Adult-Endothelial"
## [19] "Dev-Quiescent" "Dev-Replicating" "Adult-Microglia"
## [22] "Adult-OtherNeuron" "Adult-OPC" "Adult-Oligo"
matrix_Wang = as.data.frame(t(as.matrix(table_from_Wang[, -1])))
colnames(matrix_Wang) = table_from_Wang$CellType
matrix_Wang$Astro = rowSums(matrix_Wang[, "Adult-Astrocytes", drop=FALSE])
matrix_Wang$Exc = rowSums(matrix_Wang[, grep("Ex", table_from_Wang$CellType)])
matrix_Wang$Inh = rowSums(matrix_Wang[, grep("In", table_from_Wang$CellType)])
matrix_Wang$Micro = rowSums(matrix_Wang[, "Adult-Microglia", drop=FALSE])
matrix_Wang$Oligo = rowSums(matrix_Wang[, "Adult-Oligo", drop=FALSE])
matrix_Wang$OPC = rowSums(matrix_Wang[, "Adult-OPC", drop=FALSE])
# read CMC SCZ cellular proportions that we have inferred using ICeDT and CIBERSORT
cellular_proportions = readRDS(file.path(scz_working_dir, "data/SCZ_prop.rds"))
rse_filtered = readRDS(file.path(scz_working_dir, "data/rse_filtered_SV.rds"))
cellular_proportions$ICeDT = cellular_proportions$ICeDT[colnames(rse_filtered), ]
cellular_proportions$CIBERSORT = cellular_proportions$CIBERSORT[colnames(rse_filtered), ]
clinical_data = read.csv("data/Release3_SampleID_key.csv", as.is = TRUE)
matched_RNAseq_ID = clinical_data$RNAseq.Sample_RNA_ID[match(rownames(matrix_Wang), clinical_data$Individual_ID)]
matrix_Wang = matrix_Wang[!is.na(matched_RNAseq_ID), ]
rownames(matrix_Wang) = matched_RNAseq_ID[!is.na(matched_RNAseq_ID)]
matrix_Wang_subset = matrix_Wang[colnames(rse_filtered), colnames(cellular_proportions$ICeDT)]
dim(matrix_Wang_subset)## [1] 527 6
## [1] 518 6
matrix_Wang_subset$diagnosis = colData(rse_filtered)$Project.ID
# opar = par(mfrow=c(2,3), cex.main = 0.75)
# for (cell_type in colnames(cellular_proportions$ICeDT)) {
# plot(matrix_Wang_subset[, cell_type],
# cellular_proportions$ICeDT[, cell_type],
# main = paste0(cell_type, " - ICeDT est. vs Wang Suppl. Table"),
# xlab = "Wang Suppl. Table", ylab = "ICeDT")
# abline(a = 0, b = 1)
# }
#
# for (cell_type in colnames(cellular_proportions$ICeDT)) {
# plot(matrix_Wang_subset[, cell_type],
# cellular_proportions$CIBERSORT[, cell_type],
# main = paste0(cell_type, " - CIBERSORT est. vs. Wang Suppl. Table"),
# xlab = "Wang Suppl. Table", ylab = "CIBERSORT")
# abline(a = 0, b = 1)
# }
# par(opar)# ICeDT
rho_df = merge(matrix_Wang_subset, cellular_proportions$ICeDT, by="row.names",
suffixes=c(".Wang_et_al", ".ICeDT"))
dim(rho_df)## [1] 518 14
## Row.names Astro.Wang_et_al Exc.Wang_et_al Inh.Wang_et_al
## 1 MSSM_RNA_PFC_1 0.23 0.27 0.033
## 2 MSSM_RNA_PFC_10 0.23 0.25 0.013
## Micro.Wang_et_al Oligo.Wang_et_al OPC.Wang_et_al diagnosis Astro.ICeDT
## 1 0.086 0.23 0 SCZ 0.15
## 2 0.063 0.22 0 Control 0.14
## Exc.ICeDT Inh.ICeDT Micro.ICeDT Oligo.ICeDT OPC.ICeDT
## 1 0.43 0.087 0.027 0.24 0.065
## 2 0.51 0.129 0.020 0.16 0.042
## Astro.Wang_et_al Exc.Wang_et_al Inh.Wang_et_al Micro.Wang_et_al
## Min. :0.08 Min. :0.21 Min. :0.000 Min. :0.000
## 1st Qu.:0.22 1st Qu.:0.28 1st Qu.:0.000 1st Qu.:0.042
## Median :0.24 Median :0.30 Median :0.010 Median :0.060
## Mean :0.24 Mean :0.30 Mean :0.016 Mean :0.064
## 3rd Qu.:0.26 3rd Qu.:0.33 3rd Qu.:0.025 3rd Qu.:0.086
## Max. :0.38 Max. :0.42 Max. :0.074 Max. :0.173
## Oligo.Wang_et_al OPC.Wang_et_al diagnosis Astro.ICeDT
## Min. :0.000 Min. :0.000 Length:518 Min. :0.02
## 1st Qu.:0.116 1st Qu.:0.000 Class :character 1st Qu.:0.12
## Median :0.143 Median :0.000 Mode :character Median :0.14
## Mean :0.143 Mean :0.001 Mean :0.15
## 3rd Qu.:0.172 3rd Qu.:0.000 3rd Qu.:0.16
## Max. :0.268 Max. :0.038 Max. :0.42
## Exc.ICeDT Inh.ICeDT Micro.ICeDT Oligo.ICeDT
## Min. :0.30 Min. :0.021 Min. :0.005 Min. :0.018
## 1st Qu.:0.55 1st Qu.:0.087 1st Qu.:0.014 1st Qu.:0.053
## Median :0.60 Median :0.110 Median :0.018 Median :0.077
## Mean :0.59 Mean :0.108 Mean :0.021 Mean :0.087
## 3rd Qu.:0.64 3rd Qu.:0.133 3rd Qu.:0.026 3rd Qu.:0.113
## Max. :0.75 Max. :0.191 Max. :0.087 Max. :0.308
## OPC.ICeDT
## Min. :0.006
## 1st Qu.:0.032
## Median :0.043
## Mean :0.046
## 3rd Qu.:0.054
## Max. :0.129
g2 = list()
g2[[1]] = ggplot(rho_df, aes(x=Astro.ICeDT, y=Astro.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "ICeDT", y="Wang_et_al", title="Astro")
g2[[2]] = ggplot(rho_df, aes(x=Exc.ICeDT, y=Exc.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "ICeDT", y="Wang_et_al", title="Exc")
g2[[3]] = ggplot(rho_df, aes(x=Inh.ICeDT, y=Inh.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "ICeDT", y="Wang_et_al", title="Inh")
g2[[4]] = ggplot(rho_df, aes(x=Micro.ICeDT, y=Micro.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "ICeDT", y="Wang_et_al", title="Micro")
g2[[5]] = ggplot(rho_df, aes(x=Oligo.ICeDT, y=Oligo.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "ICeDT", y="Wang_et_al", title="Oligo")
g2[[6]] = ggplot(rho_df, aes(x=OPC.ICeDT, y=OPC.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "ICeDT", y="Wang_et_al", title="OPC")
ggarrange(g2[[1]], g2[[2]], g2[[3]], g2[[4]], g2[[5]], g2[[6]],
ncol = 3, nrow = 2, common.legend = TRUE)# CIBERSORT
rho_df = merge(matrix_Wang_subset, cellular_proportions$CIBERSORT, by="row.names",
suffixes=c(".Wang_et_al", ".CIBERSORT"))
dim(rho_df)## [1] 518 14
## Row.names Astro.Wang_et_al Exc.Wang_et_al Inh.Wang_et_al
## 1 MSSM_RNA_PFC_1 0.23 0.27 0.033
## 2 MSSM_RNA_PFC_10 0.23 0.25 0.013
## Micro.Wang_et_al Oligo.Wang_et_al OPC.Wang_et_al diagnosis
## 1 0.086 0.23 0 SCZ
## 2 0.063 0.22 0 Control
## Astro.CIBERSORT Exc.CIBERSORT Inh.CIBERSORT Micro.CIBERSORT
## 1 0.075 0.61 0.0084 0.0094
## 2 0.072 0.76 0.0190 0.0111
## Oligo.CIBERSORT OPC.CIBERSORT
## 1 0.30 0
## 2 0.14 0
## Astro.Wang_et_al Exc.Wang_et_al Inh.Wang_et_al Micro.Wang_et_al
## Min. :0.08 Min. :0.21 Min. :0.000 Min. :0.000
## 1st Qu.:0.22 1st Qu.:0.28 1st Qu.:0.000 1st Qu.:0.042
## Median :0.24 Median :0.30 Median :0.010 Median :0.060
## Mean :0.24 Mean :0.30 Mean :0.016 Mean :0.064
## 3rd Qu.:0.26 3rd Qu.:0.33 3rd Qu.:0.025 3rd Qu.:0.086
## Max. :0.38 Max. :0.42 Max. :0.074 Max. :0.173
## Oligo.Wang_et_al OPC.Wang_et_al diagnosis Astro.CIBERSORT
## Min. :0.000 Min. :0.000 Length:518 Min. :0.00
## 1st Qu.:0.116 1st Qu.:0.000 Class :character 1st Qu.:0.07
## Median :0.143 Median :0.000 Mode :character Median :0.09
## Mean :0.143 Mean :0.001 Mean :0.10
## 3rd Qu.:0.172 3rd Qu.:0.000 3rd Qu.:0.11
## Max. :0.268 Max. :0.038 Max. :0.49
## Exc.CIBERSORT Inh.CIBERSORT Micro.CIBERSORT Oligo.CIBERSORT
## Min. :0.35 Min. :0.000 Min. :0.000 Min. :0.01
## 1st Qu.:0.75 1st Qu.:0.008 1st Qu.:0.000 1st Qu.:0.05
## Median :0.80 Median :0.016 Median :0.004 Median :0.07
## Mean :0.78 Mean :0.017 Mean :0.007 Mean :0.09
## 3rd Qu.:0.85 3rd Qu.:0.023 3rd Qu.:0.010 3rd Qu.:0.12
## Max. :0.96 Max. :0.066 Max. :0.059 Max. :0.41
## OPC.CIBERSORT
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.001
## 3rd Qu.:0.000
## Max. :0.050
g2 = list()
g2[[1]] = ggplot(rho_df, aes(x=Astro.CIBERSORT, y=Astro.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "CIBERSORT", y="Wang_et_al", title="Astro")
g2[[2]] = ggplot(rho_df, aes(x=Exc.CIBERSORT, y=Exc.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "CIBERSORT", y="Wang_et_al", title="Exc")
g2[[3]] = ggplot(rho_df, aes(x=Inh.CIBERSORT, y=Inh.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "CIBERSORT", y="Wang_et_al", title="Inh")
g2[[4]] = ggplot(rho_df, aes(x=Micro.CIBERSORT, y=Micro.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "CIBERSORT", y="Wang_et_al", title="Micro")
g2[[5]] = ggplot(rho_df, aes(x=Oligo.CIBERSORT, y=Oligo.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "CIBERSORT", y="Wang_et_al", title="Oligo")
g2[[6]] = ggplot(rho_df, aes(x=OPC.CIBERSORT, y=OPC.Wang_et_al, col=diagnosis)) +
geom_point(alpha=0.5) + geom_abline(intercept = 0, slope =1) +
labs(x = "CIBERSORT", y="Wang_et_al", title="OPC")
ggarrange(g2[[1]], g2[[2]], g2[[3]], g2[[4]], g2[[5]], g2[[6]],
ncol = 3, nrow = 2, common.legend = TRUE)# library(wesanderson)
cellular_proportions_longtable =
matrix_Wang_subset[, 1:6] %>% as.data.frame() %>%
rownames_to_column(var = "samples") %>%
add_column(group = colData(rse_filtered)$Project.ID) %>%
add_column(Exc_prop = cellular_proportions$ICeDT[,"Exc"]) %>%
pivot_longer(cols = Astro:OPC, names_to = "cell type", values_to = "cellular proportions")
# There are three centers: MSSM, Penn, and Pitt.
colData(rse_filtered)$Institution = "MSSM"
colData(rse_filtered)$Institution[colData(rse_filtered)$InstitutionPenn == 1] = "Penn"
colData(rse_filtered)$Institution[colData(rse_filtered)$InstitutionPitt == 1] = "Pitt"
opar = par(cex.axis = 0.7)
boxplot(log_depth~Institution+Project.ID, colData(rse_filtered))par(opar)
g_scz_1_SCZ = ggplot(data=cellular_proportions_longtable %>% subset(group %in% "SCZ"), aes(x=reorder(samples, Exc_prop), y=`cellular proportions`, fill=`cell type`)) +
geom_bar(stat = "identity", width=1) +
ggtitle("SCZ samples") +
xlab("sample") +
scale_fill_manual(values = wes_palette(6, name="Darjeeling2", type="continuous")) +
theme_minimal() +
theme(panel.grid.major = element_blank(), axis.text.x = element_blank(), legend.position = "none")
g_scz_1_Control = ggplot(data=cellular_proportions_longtable %>% subset(group %in% "Control"), aes(x=reorder(samples, Exc_prop), y=`cellular proportions`, fill=`cell type`)) +
geom_bar(stat = "identity", width=1) +
ggtitle("Control samples") +
xlab("sample") +
scale_fill_manual(values = wes_palette(6, name="Darjeeling2", type="continuous")) +
theme_minimal() +
theme(panel.grid.major = element_blank(), axis.text.x = element_blank())
g_scz_1 = ggarrange(g_scz_1_SCZ, g_scz_1_Control, ncol = 2, widths = c(4,5), nrow = 1)## Warning: Removed 54 rows containing missing values (position_stack).
# log ratio against Exc
summary_Wang_et_al = list()
summary_Wang_et_al[["Astro"]]=summary(lm(log((1e-2 + matrix_Wang_subset[,"Astro"] / (1e-2 + matrix_Wang_subset[,"Exc"])))~ Project.ID + log_depth + Institution +
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_Wang_et_al[["Astro"]]##
## Call:
## lm(formula = log((0.01 + matrix_Wang_subset[, "Astro"]/(0.01 +
## matrix_Wang_subset[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3048 -0.0767 0.0036 0.0816 0.4532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09181 0.23248 0.39 0.6931
## Project.IDSCZ 0.01337 0.01466 0.91 0.3620
## log_depth -0.04859 0.03434 -1.42 0.1577
## InstitutionPenn 0.01524 0.02274 0.67 0.5030
## InstitutionPitt -0.05428 0.02248 -2.41 0.0161 *
## genderMale 0.05241 0.01470 3.56 0.0004 ***
## scaled_age_death 0.02113 0.00965 2.19 0.0290 *
## scaled_PMI -0.00307 0.00751 -0.41 0.6828
## scaled_RIN -0.85098 0.11122 -7.65 1.0e-13 ***
## scaled_RIN2 0.84101 0.11329 7.42 5.0e-13 ***
## genoPC1 -0.01411 0.00679 -2.08 0.0383 *
## genoPC2 -0.00290 0.00731 -0.40 0.6922
## sv1 -0.07805 0.01528 -5.11 4.7e-07 ***
## sv2 -0.03540 0.01085 -3.26 0.0012 **
## libclustB -0.03806 0.02531 -1.50 0.1332
## libclustbase -0.18134 0.04224 -4.29 2.1e-05 ***
## libclustC -0.02260 0.02925 -0.77 0.4400
## libclustD -0.01906 0.02644 -0.72 0.4715
## libclustE -0.03413 0.02691 -1.27 0.2052
## libclustF -0.05670 0.02831 -2.00 0.0457 *
## libclustG -0.09784 0.03658 -2.67 0.0077 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.15 on 497 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.263, Adjusted R-squared: 0.233
## F-statistic: 8.85 on 20 and 497 DF, p-value: <2e-16
summary_Wang_et_al[["Inh"]]=summary(lm(log((1e-2 + matrix_Wang_subset[,"Inh"] / (1e-2 + matrix_Wang_subset[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_Wang_et_al[["Inh"]]##
## Call:
## lm(formula = log((0.01 + matrix_Wang_subset[, "Inh"]/(0.01 +
## matrix_Wang_subset[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.532 -0.665 0.133 0.655 1.984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.320655 1.354157 -3.93 9.7e-05 ***
## Project.IDSCZ 0.238390 0.085366 2.79 0.00543 **
## log_depth 0.354485 0.200003 1.77 0.07694 .
## InstitutionPenn -0.494827 0.132468 -3.74 0.00021 ***
## InstitutionPitt -0.035410 0.130932 -0.27 0.78693
## genderMale -0.743932 0.085643 -8.69 < 2e-16 ***
## scaled_age_death 0.017500 0.056204 0.31 0.75565
## scaled_PMI -0.030579 0.043740 -0.70 0.48481
## scaled_RIN -2.227408 0.647849 -3.44 0.00063 ***
## scaled_RIN2 1.970580 0.659895 2.99 0.00296 **
## genoPC1 -0.015540 0.039568 -0.39 0.69467
## genoPC2 -0.039325 0.042593 -0.92 0.35631
## sv1 0.152619 0.089009 1.71 0.08703 .
## sv2 0.125326 0.063224 1.98 0.04800 *
## libclustB 0.045194 0.147419 0.31 0.75930
## libclustbase 0.129447 0.246055 0.53 0.59906
## libclustC -0.000982 0.170350 -0.01 0.99540
## libclustD 0.322413 0.154027 2.09 0.03684 *
## libclustE 0.073273 0.156727 0.47 0.64033
## libclustF 0.005244 0.164874 0.03 0.97464
## libclustG 0.127532 0.213098 0.60 0.54980
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.87 on 497 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.317, Adjusted R-squared: 0.29
## F-statistic: 11.5 on 20 and 497 DF, p-value: <2e-16
summary_Wang_et_al[["Micro"]]=summary(lm(log((1e-2 + matrix_Wang_subset[,"Micro"] / (1e-2 + matrix_Wang_subset[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_Wang_et_al[["Micro"]]##
## Call:
## lm(formula = log((0.01 + matrix_Wang_subset[, "Micro"]/(0.01 +
## matrix_Wang_subset[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.233 -0.227 0.102 0.333 1.788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.98354 1.03115 -0.95 0.34063
## Project.IDSCZ -0.01851 0.06500 -0.28 0.77589
## log_depth -0.07279 0.15230 -0.48 0.63288
## InstitutionPenn 0.31711 0.10087 3.14 0.00177 **
## InstitutionPitt -0.68642 0.09970 -6.88 1.8e-11 ***
## genderMale 0.00871 0.06521 0.13 0.89382
## scaled_age_death 0.23596 0.04280 5.51 5.7e-08 ***
## scaled_PMI -0.13852 0.03331 -4.16 3.8e-05 ***
## scaled_RIN -0.64741 0.49332 -1.31 0.19000
## scaled_RIN2 0.71224 0.50249 1.42 0.15699
## genoPC1 0.01172 0.03013 0.39 0.69741
## genoPC2 0.04429 0.03243 1.37 0.17271
## sv1 -0.25989 0.06778 -3.83 0.00014 ***
## sv2 -0.18575 0.04814 -3.86 0.00013 ***
## libclustB -0.05874 0.11226 -0.52 0.60100
## libclustbase -0.13618 0.18736 -0.73 0.46767
## libclustC -0.25900 0.12972 -2.00 0.04641 *
## libclustD -0.04967 0.11729 -0.42 0.67215
## libclustE -0.07319 0.11934 -0.61 0.53999
## libclustF -0.24438 0.12555 -1.95 0.05216 .
## libclustG -0.49272 0.16227 -3.04 0.00252 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.66 on 497 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.306, Adjusted R-squared: 0.278
## F-statistic: 11 on 20 and 497 DF, p-value: <2e-16
summary_Wang_et_al[["Oligo"]]=summary(lm(log((1e-2 + matrix_Wang_subset[,"Oligo"] / (1e-2 + matrix_Wang_subset[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_Wang_et_al[["Oligo"]]##
## Call:
## lm(formula = log((0.01 + matrix_Wang_subset[, "Oligo"]/(0.01 +
## matrix_Wang_subset[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.282 -0.167 0.030 0.215 0.885
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.948138 0.555791 -1.71 0.0886 .
## Project.IDSCZ -0.071760 0.035037 -2.05 0.0411 *
## log_depth 0.042714 0.082088 0.52 0.6031
## InstitutionPenn -0.013521 0.054369 -0.25 0.8037
## InstitutionPitt -0.383027 0.053739 -7.13 3.6e-12 ***
## genderMale -0.033729 0.035151 -0.96 0.3377
## scaled_age_death 0.070096 0.023068 3.04 0.0025 **
## scaled_PMI -0.031557 0.017952 -1.76 0.0794 .
## scaled_RIN -0.378598 0.265899 -1.42 0.1551
## scaled_RIN2 0.365584 0.270843 1.35 0.1777
## genoPC1 -0.040184 0.016240 -2.47 0.0137 *
## genoPC2 0.000287 0.017482 0.02 0.9869
## sv1 -0.262000 0.036532 -7.17 2.7e-12 ***
## sv2 0.064945 0.025949 2.50 0.0126 *
## libclustB -0.064392 0.060506 -1.06 0.2877
## libclustbase 0.039049 0.100989 0.39 0.6992
## libclustC 0.070300 0.069917 1.01 0.3152
## libclustD -0.038900 0.063218 -0.62 0.5386
## libclustE -0.029462 0.064326 -0.46 0.6471
## libclustF 0.110971 0.067670 1.64 0.1017
## libclustG -0.029675 0.087463 -0.34 0.7345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.36 on 497 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.358, Adjusted R-squared: 0.332
## F-statistic: 13.9 on 20 and 497 DF, p-value: <2e-16
summary_Wang_et_al[["OPC"]]=summary(lm(log((1e-2 + matrix_Wang_subset[,"OPC"] / (1e-2 + matrix_Wang_subset[,"Exc"])))~ Project.ID + log_depth + Institution+
genderMale + scaled_age_death + scaled_PMI + scaled_RIN + scaled_RIN2 +
genoPC1 + genoPC2 +
sv1 + sv2 +
libclustB + libclustbase + libclustC + libclustD + libclustE + libclustF + libclustG, data=colData(rse_filtered)))
summary_Wang_et_al[["OPC"]]##
## Call:
## lm(formula = log((0.01 + matrix_Wang_subset[, "OPC"]/(0.01 +
## matrix_Wang_subset[, "Exc"]))) ~ Project.ID + log_depth +
## Institution + genderMale + scaled_age_death + scaled_PMI +
## scaled_RIN + scaled_RIN2 + genoPC1 + genoPC2 + sv1 + sv2 +
## libclustB + libclustbase + libclustC + libclustD + libclustE +
## libclustF + libclustG, data = colData(rse_filtered))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5833 -0.1913 -0.0513 0.0725 2.0518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.63790 0.56324 -8.23 1.6e-15 ***
## Project.IDSCZ 0.01650 0.03551 0.46 0.64225
## log_depth 0.02926 0.08319 0.35 0.72519
## InstitutionPenn 0.10973 0.05510 1.99 0.04697 *
## InstitutionPitt 0.04303 0.05446 0.79 0.42985
## genderMale -0.20126 0.03562 -5.65 2.7e-08 ***
## scaled_age_death -0.02955 0.02338 -1.26 0.20681
## scaled_PMI -0.01777 0.01819 -0.98 0.32916
## scaled_RIN -0.73000 0.26946 -2.71 0.00698 **
## scaled_RIN2 0.73888 0.27447 2.69 0.00734 **
## genoPC1 -0.01772 0.01646 -1.08 0.28209
## genoPC2 -0.00215 0.01772 -0.12 0.90332
## sv1 0.14300 0.03702 3.86 0.00013 ***
## sv2 -0.00850 0.02630 -0.32 0.74652
## libclustB 0.05305 0.06132 0.87 0.38736
## libclustbase -0.00142 0.10234 -0.01 0.98896
## libclustC 0.01989 0.07085 0.28 0.77908
## libclustD 0.10829 0.06406 1.69 0.09159 .
## libclustE 0.07469 0.06519 1.15 0.25244
## libclustF -0.04485 0.06858 -0.65 0.51337
## libclustG -0.05526 0.08863 -0.62 0.53326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.36 on 497 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.209, Adjusted R-squared: 0.177
## F-statistic: 6.55 on 20 and 497 DF, p-value: 5.62e-16
# See http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
# collect summary of cellular fraction linear models
summary_table = NULL
for (cell_type in names(summary_Wang_et_al)) {
summary_table = rbind(summary_table,
data.frame(method="Wang_et_al",
cell_type=cell_type,
diagnosis_effect=summary_Wang_et_al[[cell_type]]$coefficients[2, "Estimate"],
se=summary_Wang_et_al[[cell_type]]$coefficients[2, "Std. Error"],
pval=summary_Wang_et_al[[cell_type]]$coefficients[2, "Pr(>|t|)"])
)
}
summary_table$method = factor(summary_table$method, levels=c("Wang_et_al"))
summary_table## method cell_type diagnosis_effect se pval
## 1 Wang_et_al Astro 0.013 0.015 0.3620
## 2 Wang_et_al Inh 0.238 0.085 0.0054
## 3 Wang_et_al Micro -0.019 0.065 0.7759
## 4 Wang_et_al Oligo -0.072 0.035 0.0411
## 5 Wang_et_al OPC 0.017 0.036 0.6423
# The errorbars overlapped, so use position_dodge to move them horizontally
pd = position_dodge(0.3) # move them to the left and right
summary_table$cell_type = factor(summary_table$cell_type, levels = rev(levels(summary_table$cell_type)))
g_asd_2 = ggplot(summary_table, aes(x=cell_type, y=diagnosis_effect, colour=method, group=method)) +
geom_errorbar(aes(ymin=diagnosis_effect-se, ymax=diagnosis_effect+se), colour="black", width=.1, position=pd) +
# geom_line(position=pd) +
geom_point(position=pd, size=3, shape=21, fill="white") + # 21 is filled circle
xlab("Cell types") +
ylab("Difference of log ratio between case and control") +
scale_colour_hue(name="Deconvolution method", # Legend label, use darker colors
breaks=c("Wang_et_al"),
labels=c("Wang_et_al"),
l=40) + # Use darker colors, lightness=40
ggtitle(" Cellular fraction estimates in Wang et al.\n on SCZ vs. Control\n adjusted for covariates") +
expand_limits(y=0) + # Expand y range
theme_bw() +
geom_hline(yintercept=0, linetype="dashed")
g_asd_2# goseq analysis
DE_genes_Oligo_table = read.csv("results/DEG_Oligo_in_both_SCZ_ASD.csv", as.is=TRUE)
dim(DE_genes_Oligo_table)## [1] 124 6
## gene_id gene_name CARseq_Oligo.SCZ CARseq_Oligo.ASD
## 1 ENSG00000119138 KLF9 1.6e-03 0.023
## 2 ENSG00000227741 RP11-536C5.7 3.1e-02 0.029
## 3 ENSG00000121579 NAA50 6.8e-04 0.029
## 4 ENSG00000127993 RBM48 2.3e-05 0.043
## 5 ENSG00000153201 RANBP2 1.8e-03 0.040
## 6 ENSG00000136816 TOR1B 7.6e-04 0.046
## SCZ_vs_Control.Oligo ASD_vs_Control.Oligo
## 1 -1.6 1.5
## 2 -1.5 -1.3
## 3 -1.5 0.9
## 4 -1.5 1.3
## 5 -1.5 1.1
## 6 -1.4 -1.1
names_DE_genes = unique(rowData(rse_filtered)$gene_name)
DE_genes = rep(0, length(names_DE_genes))
names(DE_genes) = names_DE_genes
DE_genes[DE_genes_Oligo_table$gene_name] = 1
cat(sprintf("# of DE genes = %d\n", sum(DE_genes)))## # of DE genes = 124
pathways_reactome_filtered = pathways_reactome[sapply(pathways_reactome, length) %in% 10:1000]
if (sum(DE_genes) > 0) {
# # alternatively, use gene_length data we have obtained from GTF file:
# pwf = nullp(DE_genes, "dummy", "dummy", bias.data=rowData(rse_filtered)$gene_length)
pwf = nullp(DE_genes, "hg19", "geneSymbol")
pvals = goseq(pwf, "hg19","geneSymbol", gene2cat = goseq:::reversemapping(pathways_reactome_filtered))
pvals$qval_over_represented = CARseq:::get_qvalues_one_inflated(pvals$over_represented_pvalue)
pvals$qval_under_represented = CARseq:::get_qvalues_one_inflated(pvals$under_represented_pvalue)
# empty set -- no significant pathways were found
print(pvals[pvals$qval_over_represented < 0.1 | pvals$qval_under_represented < 0.1, ])
} else {
cat("No DE genes have been found under the current cutoffs!\n")
}## Warning in library(): libraries '/usr/local/lib/R/site-library', '/usr/lib/
## R/site-library' contain no packages
## Loading hg19 length data...
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## For 12742 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## [1] category over_represented_pvalue
## [3] under_represented_pvalue numDEInCat
## [5] numInCat qval_over_represented
## [7] qval_under_represented
## <0 rows> (or 0-length row.names)
# goseq analysis
DE_genes_Oligo_table = read.csv("results/DEG_Oligo_in_both_SCZ_ASD.csv", as.is=TRUE)
dim(DE_genes_Oligo_table)## [1] 124 6
## gene_id gene_name CARseq_Oligo.SCZ CARseq_Oligo.ASD
## 1 ENSG00000119138 KLF9 1.6e-03 0.023
## 2 ENSG00000227741 RP11-536C5.7 3.1e-02 0.029
## 3 ENSG00000121579 NAA50 6.8e-04 0.029
## 4 ENSG00000127993 RBM48 2.3e-05 0.043
## 5 ENSG00000153201 RANBP2 1.8e-03 0.040
## 6 ENSG00000136816 TOR1B 7.6e-04 0.046
## SCZ_vs_Control.Oligo ASD_vs_Control.Oligo
## 1 -1.6 1.5
## 2 -1.5 -1.3
## 3 -1.5 0.9
## 4 -1.5 1.3
## 5 -1.5 1.1
## 6 -1.4 -1.1
names_DE_genes = unique(rowData(rse_filtered)$gene_name)
DE_genes = rep(0, length(names_DE_genes))
names(DE_genes) = names_DE_genes
DE_genes[DE_genes_Oligo_table$gene_name] = 1
cat(sprintf("# of DE genes = %d\n", sum(DE_genes)))## # of DE genes = 124
pathways_go_bp_filtered = pathways_go_bp[sapply(pathways_go_bp, length) %in% 10:1000]
if (sum(DE_genes) > 0) {
# # alternatively, use gene_length data we have obtained from GTF file:
# pwf = nullp(DE_genes, "dummy", "dummy", bias.data=rowData(rse_filtered)$gene_length)
pwf = nullp(DE_genes, "hg19", "geneSymbol")
pvals = goseq(pwf, "hg19","geneSymbol", gene2cat = goseq:::reversemapping(pathways_go_bp_filtered))
pvals$qval_over_represented = CARseq:::get_qvalues_one_inflated(pvals$over_represented_pvalue)
pvals$qval_under_represented = CARseq:::get_qvalues_one_inflated(pvals$under_represented_pvalue)
# empty set -- no significant pathways were found
print(pvals[pvals$qval_over_represented < 0.1 | pvals$qval_under_represented < 0.1, ])
} else {
cat("No DE genes have been found under the current cutoffs!\n")
}## Warning in library(): libraries '/usr/local/lib/R/site-library', '/usr/lib/
## R/site-library' contain no packages
## Loading hg19 length data...
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## For 8912 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## [1] category over_represented_pvalue
## [3] under_represented_pvalue numDEInCat
## [5] numInCat qval_over_represented
## [7] qval_under_represented
## <0 rows> (or 0-length row.names)
# goseq analysis
DE_genes_Micro_table = read.csv("results/DEG_Micro_in_both_SCZ_ASD.csv", as.is=TRUE)
dim(DE_genes_Micro_table)## [1] 65 6
## gene_id gene_name CARseq_Micro.SCZ CARseq_Micro.ASD
## 1 ENSG00000187984 ANKRD19P 0.0002 0.0395
## 2 ENSG00000225975 AC074138.3 0.0098 0.0185
## 3 ENSG00000078401 EDN1 0.0018 0.0075
## 4 ENSG00000160298 C21orf58 0.0248 0.0286
## 5 ENSG00000183801 OLFML1 0.0231 0.0121
## 6 ENSG00000100156 SLC16A8 0.0427 0.0191
## SCZ_vs_Control.Micro ASD_vs_Control.Micro
## 1 -1.9 -0.00077
## 2 -1.6 -0.00113
## 3 -1.4 -1.79444
## 4 -1.3 -1.82892
## 5 -1.3 -0.00213
## 6 -1.2 -1.78788
names_DE_genes = unique(rowData(rse_filtered)$gene_name)
DE_genes = rep(0, length(names_DE_genes))
names(DE_genes) = names_DE_genes
DE_genes[DE_genes_Micro_table$gene_name] = 1
cat(sprintf("# of DE genes = %d\n", sum(DE_genes)))## # of DE genes = 65
pathways_reactome_filtered = pathways_reactome[sapply(pathways_reactome, length) %in% 10:1000]
if (sum(DE_genes) > 0) {
# # alternatively, use gene_length data we have obtained from GTF file:
# pwf = nullp(DE_genes, "dummy", "dummy", bias.data=rowData(rse_filtered)$gene_length)
pwf = nullp(DE_genes, "hg19", "geneSymbol")
pvals = goseq(pwf, "hg19","geneSymbol", gene2cat = goseq:::reversemapping(pathways_reactome_filtered))
pvals$qval_over_represented = CARseq:::get_qvalues_one_inflated(pvals$over_represented_pvalue)
pvals$qval_under_represented = CARseq:::get_qvalues_one_inflated(pvals$under_represented_pvalue)
significant_goseq_results = pvals[pvals$qval_over_represented < 0.1 | pvals$qval_under_represented < 0.1, ]
print(significant_goseq_results)
write.csv(significant_goseq_results, "results/DEG_Micro_in_both_SCZ_ASD_goseq_REACTOME.csv", quote = FALSE)
} else {
cat("No DE genes have been found under the current cutoffs!\n")
}## Warning in library(): libraries '/usr/local/lib/R/site-library', '/usr/lib/
## R/site-library' contain no packages
## Loading hg19 length data...
## Using manually entered categories.
## For 12742 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## category
## 329 REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION
## 892 REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY
## 948 REACTOME_SELENOAMINO_ACID_METABOLISM
## 1038 REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE
## 677 REACTOME_NONSENSE_MEDIATED_DECAY_NMD
## 330 REACTOME_EUKARYOTIC_TRANSLATION_INITIATION
## 150 REACTOME_CELLULAR_RESPONSES_TO_EXTERNAL_STIMULI
## 479 REACTOME_INFLUENZA_INFECTION
## 491 REACTOME_INTEGRIN_CELL_SURFACE_INTERACTIONS
## 843 REACTOME_REGULATION_OF_EXPRESSION_OF_SLITS_AND_ROBOS
## 334 REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION
## 932 REACTOME_RRNA_PROCESSING
## 1016 REACTOME_SIGNALING_BY_ROBO_RECEPTORS
## 345 REACTOME_FCGR_ACTIVATION
## over_represented_pvalue under_represented_pvalue numDEInCat numInCat
## 329 1.7e-05 1 6 90
## 892 2.5e-05 1 6 98
## 948 3.7e-05 1 6 105
## 1038 4.5e-05 1 6 109
## 677 4.6e-05 1 6 113
## 330 6.0e-05 1 6 117
## 150 1.7e-04 1 10 487
## 479 1.8e-04 1 6 149
## 491 2.7e-04 1 4 73
## 843 3.0e-04 1 6 160
## 334 5.7e-04 1 6 237
## 932 6.2e-04 1 6 188
## 1016 8.6e-04 1 6 204
## 345 1.3e-03 1 2 11
## qval_over_represented qval_under_represented
## 329 0.0098 1
## 892 0.0098 1
## 948 0.0098 1
## 1038 0.0098 1
## 677 0.0098 1
## 330 0.0107 1
## 150 0.0236 1
## 479 0.0236 1
## 491 0.0319 1
## 843 0.0322 1
## 334 0.0548 1
## 932 0.0548 1
## 1016 0.0708 1
## 345 0.0999 1
# goseq analysis
DE_genes_Micro_table = read.csv("results/DEG_Micro_in_both_SCZ_ASD.csv", as.is=TRUE)
dim(DE_genes_Micro_table)## [1] 65 6
## gene_id gene_name CARseq_Micro.SCZ CARseq_Micro.ASD
## 1 ENSG00000187984 ANKRD19P 0.0002 0.0395
## 2 ENSG00000225975 AC074138.3 0.0098 0.0185
## 3 ENSG00000078401 EDN1 0.0018 0.0075
## 4 ENSG00000160298 C21orf58 0.0248 0.0286
## 5 ENSG00000183801 OLFML1 0.0231 0.0121
## 6 ENSG00000100156 SLC16A8 0.0427 0.0191
## SCZ_vs_Control.Micro ASD_vs_Control.Micro
## 1 -1.9 -0.00077
## 2 -1.6 -0.00113
## 3 -1.4 -1.79444
## 4 -1.3 -1.82892
## 5 -1.3 -0.00213
## 6 -1.2 -1.78788
names_DE_genes = unique(rowData(rse_filtered)$gene_name)
DE_genes = rep(0, length(names_DE_genes))
names(DE_genes) = names_DE_genes
DE_genes[DE_genes_Micro_table$gene_name] = 1
cat(sprintf("# of DE genes = %d\n", sum(DE_genes)))## # of DE genes = 65
pathways_go_bp_filtered = pathways_go_bp[sapply(pathways_go_bp, length) %in% 10:1000]
if (sum(DE_genes) > 0) {
# # alternatively, use gene_length data we have obtained from GTF file:
# pwf = nullp(DE_genes, "dummy", "dummy", bias.data=rowData(rse_filtered)$gene_length)
pwf = nullp(DE_genes, "hg19", "geneSymbol")
pvals = goseq(pwf, "hg19","geneSymbol", gene2cat = goseq:::reversemapping(pathways_go_bp_filtered))
pvals$qval_over_represented = CARseq:::get_qvalues_one_inflated(pvals$over_represented_pvalue)
pvals$qval_under_represented = CARseq:::get_qvalues_one_inflated(pvals$under_represented_pvalue)
significant_goseq_results = pvals[pvals$qval_over_represented < 0.1 | pvals$qval_under_represented < 0.1, ]
print(significant_goseq_results)
write.csv(significant_goseq_results, "results/DEG_Micro_in_both_SCZ_ASD_goseq_GO_BP.csv", quote = FALSE)
} else {
cat("No DE genes have been found under the current cutoffs!\n")
}## Warning in library(): libraries '/usr/local/lib/R/site-library', '/usr/lib/
## R/site-library' contain no packages
## Loading hg19 length data...
## Using manually entered categories.
## For 8912 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## category
## 728 GO_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE
## 1053 GO_ESTABLISHMENT_OF_PROTEIN_LOCALIZATION_TO_ENDOPLASMIC_RETICULUM
## 2629 GO_NUCLEAR_TRANSCRIBED_MRNA_CATABOLIC_PROCESS_NONSENSE_MEDIATED_DECAY
## 3211 GO_POSITIVE_REGULATION_OF_LIPID_TRANSPORT
## 3654 GO_PROTEIN_LOCALIZATION_TO_ENDOPLASMIC_RETICULUM
## over_represented_pvalue under_represented_pvalue numDEInCat numInCat
## 728 1.6e-05 1 6 98
## 1053 2.8e-05 1 6 110
## 2629 3.7e-05 1 6 119
## 3211 4.5e-05 1 4 43
## 3654 7.0e-05 1 6 134
## qval_over_represented qval_under_represented
## 728 0.050 1
## 1053 0.050 1
## 2629 0.050 1
## 3211 0.050 1
## 3654 0.061 1